Home | History | Annotate | Download | only in pcre
      1 // Copyright (c) 2010, Google Inc.
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are
      6 // met:
      7 //
      8 //     * Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 //     * Redistributions in binary form must reproduce the above
     11 // copyright notice, this list of conditions and the following disclaimer
     12 // in the documentation and/or other materials provided with the
     13 // distribution.
     14 //     * Neither the name of Google Inc. nor the names of its
     15 // contributors may be used to endorse or promote products derived from
     16 // this software without specific prior written permission.
     17 //
     18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 //
     30 // Author: Sanjay Ghemawat
     31 
     32 #ifdef HAVE_CONFIG_H
     33 #include "config.h"
     34 #endif
     35 
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
#include <string.h>      /* for memcpy */
#include <assert.h>
#include <errno.h>
#include <string>
#include <algorithm>
     44 
     45 #include "pcrecpp_internal.h"
     46 #include "pcre.h"
     47 #include "pcrecpp.h"
     48 #include "pcre_stringpiece.h"
     49 
     50 
     51 namespace pcrecpp {
     52 
// Maximum number of args we can set.  The matching entry points in this
// file (FullMatch, PartialMatch, Consume, FindAndConsume) each accept up
// to this many Arg references and collect them in an array of this size.
static const int kMaxArgs = 16;
// Length of the int vector those entry points allocate on the stack and
// pass down to the matcher: three ints for the overall match plus three
// for each possible arg.
static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
     56 
// Special object that stands-in for no argument.  The matching entry
// points compare the *address* of each Arg they receive against the
// address of this object to find out how many args were really supplied.
Arg RE::no_arg((void*)NULL);
     59 
// This is for ABI compatibility with old versions of pcre (pre-7.6),
// which defined a global no_arg variable instead of putting it in the
// RE class.  This works on GCC >= 3, at least.  It definitely works
// for ELF, but may not for other object formats (Mach-O, for
// instance, does not support aliases.)  We could probably have a more
// inclusive test if we ever needed it.  (Note that not only the
// __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
// gnu-specific.)
//
// The two-level ULP_AS_STRING macro turns __USER_LABEL_PREFIX__ into a
// string literal so it can be glued in front of the mangled name of
// pcrecpp::RE::no_arg, which is the symbol the alias refers to.
#if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__)
# define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
# define ULP_AS_STRING_INTERNAL(x)   #x
# define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
extern Arg no_arg
  __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
#endif
     75 
// If a regular expression has no error, its error_ field points here.
// Because this object is shared and never owned by an RE, code in this
// file compares error_ against its address before allocating or deleting
// an error string.
static const string empty_string;

// If the user doesn't ask for any options, we just use this one
// (Init() copies it into options_ when it is handed a NULL options pointer).
static RE_Options default_options;
     81 
     82 void RE::Init(const string& pat, const RE_Options* options) {
     83   pattern_ = pat;
     84   if (options == NULL) {
     85     options_ = default_options;
     86   } else {
     87     options_ = *options;
     88   }
     89   error_ = &empty_string;
     90   re_full_ = NULL;
     91   re_partial_ = NULL;
     92 
     93   re_partial_ = Compile(UNANCHORED);
     94   if (re_partial_ != NULL) {
     95     re_full_ = Compile(ANCHOR_BOTH);
     96   }
     97 }
     98 
     99 void RE::Cleanup() {
    100   if (re_full_ != NULL)         (*pcre_free)(re_full_);
    101   if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
    102   if (error_ != &empty_string)  delete error_;
    103 }
    104 
    105 
    106 RE::~RE() {
    107   Cleanup();
    108 }
    109 
    110 
    111 pcre* RE::Compile(Anchor anchor) {
    112   // First, convert RE_Options into pcre options
    113   int pcre_options = 0;
    114   pcre_options = options_.all_options();
    115 
    116   // Special treatment for anchoring.  This is needed because at
    117   // runtime pcre only provides an option for anchoring at the
    118   // beginning of a string (unless you use offset).
    119   //
    120   // There are three types of anchoring we want:
    121   //    UNANCHORED      Compile the original pattern, and use
    122   //                    a pcre unanchored match.
    123   //    ANCHOR_START    Compile the original pattern, and use
    124   //                    a pcre anchored match.
    125   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
    126   //                    and use a pcre anchored match.
    127 
    128   const char* compile_error;
    129   int eoffset;
    130   pcre* re;
    131   if (anchor != ANCHOR_BOTH) {
    132     re = pcre_compile(pattern_.c_str(), pcre_options,
    133                       &compile_error, &eoffset, NULL);
    134   } else {
    135     // Tack a '\z' at the end of RE.  Parenthesize it first so that
    136     // the '\z' applies to all top-level alternatives in the regexp.
    137     string wrapped = "(?:";  // A non-counting grouping operator
    138     wrapped += pattern_;
    139     wrapped += ")\\z";
    140     re = pcre_compile(wrapped.c_str(), pcre_options,
    141                       &compile_error, &eoffset, NULL);
    142   }
    143   if (re == NULL) {
    144     if (error_ == &empty_string) error_ = new string(compile_error);
    145   }
    146   return re;
    147 }
    148 
    149 /***** Matching interfaces *****/
    150 
    151 bool RE::FullMatch(const StringPiece& text,
    152                    const Arg& ptr1,
    153                    const Arg& ptr2,
    154                    const Arg& ptr3,
    155                    const Arg& ptr4,
    156                    const Arg& ptr5,
    157                    const Arg& ptr6,
    158                    const Arg& ptr7,
    159                    const Arg& ptr8,
    160                    const Arg& ptr9,
    161                    const Arg& ptr10,
    162                    const Arg& ptr11,
    163                    const Arg& ptr12,
    164                    const Arg& ptr13,
    165                    const Arg& ptr14,
    166                    const Arg& ptr15,
    167                    const Arg& ptr16) const {
    168   const Arg* args[kMaxArgs];
    169   int n = 0;
    170   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
    171   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
    172   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
    173   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
    174   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
    175   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
    176   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
    177   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
    178   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
    179   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
    180   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
    181   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
    182   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
    183   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
    184   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
    185   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
    186  done:
    187 
    188   int consumed;
    189   int vec[kVecSize];
    190   return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
    191 }
    192 
    193 bool RE::PartialMatch(const StringPiece& text,
    194                       const Arg& ptr1,
    195                       const Arg& ptr2,
    196                       const Arg& ptr3,
    197                       const Arg& ptr4,
    198                       const Arg& ptr5,
    199                       const Arg& ptr6,
    200                       const Arg& ptr7,
    201                       const Arg& ptr8,
    202                       const Arg& ptr9,
    203                       const Arg& ptr10,
    204                       const Arg& ptr11,
    205                       const Arg& ptr12,
    206                       const Arg& ptr13,
    207                       const Arg& ptr14,
    208                       const Arg& ptr15,
    209                       const Arg& ptr16) const {
    210   const Arg* args[kMaxArgs];
    211   int n = 0;
    212   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
    213   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
    214   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
    215   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
    216   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
    217   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
    218   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
    219   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
    220   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
    221   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
    222   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
    223   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
    224   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
    225   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
    226   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
    227   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
    228  done:
    229 
    230   int consumed;
    231   int vec[kVecSize];
    232   return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
    233 }
    234 
    235 bool RE::Consume(StringPiece* input,
    236                  const Arg& ptr1,
    237                  const Arg& ptr2,
    238                  const Arg& ptr3,
    239                  const Arg& ptr4,
    240                  const Arg& ptr5,
    241                  const Arg& ptr6,
    242                  const Arg& ptr7,
    243                  const Arg& ptr8,
    244                  const Arg& ptr9,
    245                  const Arg& ptr10,
    246                  const Arg& ptr11,
    247                  const Arg& ptr12,
    248                  const Arg& ptr13,
    249                  const Arg& ptr14,
    250                  const Arg& ptr15,
    251                  const Arg& ptr16) const {
    252   const Arg* args[kMaxArgs];
    253   int n = 0;
    254   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
    255   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
    256   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
    257   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
    258   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
    259   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
    260   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
    261   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
    262   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
    263   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
    264   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
    265   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
    266   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
    267   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
    268   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
    269   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
    270  done:
    271 
    272   int consumed;
    273   int vec[kVecSize];
    274   if (DoMatchImpl(*input, ANCHOR_START, &consumed,
    275                   args, n, vec, kVecSize)) {
    276     input->remove_prefix(consumed);
    277     return true;
    278   } else {
    279     return false;
    280   }
    281 }
    282 
    283 bool RE::FindAndConsume(StringPiece* input,
    284                         const Arg& ptr1,
    285                         const Arg& ptr2,
    286                         const Arg& ptr3,
    287                         const Arg& ptr4,
    288                         const Arg& ptr5,
    289                         const Arg& ptr6,
    290                         const Arg& ptr7,
    291                         const Arg& ptr8,
    292                         const Arg& ptr9,
    293                         const Arg& ptr10,
    294                         const Arg& ptr11,
    295                         const Arg& ptr12,
    296                         const Arg& ptr13,
    297                         const Arg& ptr14,
    298                         const Arg& ptr15,
    299                         const Arg& ptr16) const {
    300   const Arg* args[kMaxArgs];
    301   int n = 0;
    302   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
    303   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
    304   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
    305   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
    306   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
    307   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
    308   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
    309   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
    310   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
    311   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
    312   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
    313   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
    314   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
    315   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
    316   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
    317   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
    318  done:
    319 
    320   int consumed;
    321   int vec[kVecSize];
    322   if (DoMatchImpl(*input, UNANCHORED, &consumed,
    323                   args, n, vec, kVecSize)) {
    324     input->remove_prefix(consumed);
    325     return true;
    326   } else {
    327     return false;
    328   }
    329 }
    330 
    331 bool RE::Replace(const StringPiece& rewrite,
    332                  string *str) const {
    333   int vec[kVecSize];
    334   int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
    335   if (matches == 0)
    336     return false;
    337 
    338   string s;
    339   if (!Rewrite(&s, rewrite, *str, vec, matches))
    340     return false;
    341 
    342   assert(vec[0] >= 0);
    343   assert(vec[1] >= 0);
    344   str->replace(vec[0], vec[1] - vec[0], s);
    345   return true;
    346 }
    347 
    348 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
    349 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
    350 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
    351 
    352 static int NewlineMode(int pcre_options) {
    353   // TODO: if we can make it threadsafe, cache this var
    354   int newline_mode = 0;
    355   /* if (newline_mode) return newline_mode; */  // do this once it's cached
    356   if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
    357                       PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
    358     newline_mode = (pcre_options &
    359                     (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
    360                      PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
    361   } else {
    362     int newline;
    363     pcre_config(PCRE_CONFIG_NEWLINE, &newline);
    364     if (newline == 10)
    365       newline_mode = PCRE_NEWLINE_LF;
    366     else if (newline == 13)
    367       newline_mode = PCRE_NEWLINE_CR;
    368     else if (newline == 3338)
    369       newline_mode = PCRE_NEWLINE_CRLF;
    370     else if (newline == -1)
    371       newline_mode = PCRE_NEWLINE_ANY;
    372     else if (newline == -2)
    373       newline_mode = PCRE_NEWLINE_ANYCRLF;
    374     else
    375       assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
    376   }
    377   return newline_mode;
    378 }
    379 
    380 int RE::GlobalReplace(const StringPiece& rewrite,
    381                       string *str) const {
    382   int count = 0;
    383   int vec[kVecSize];
    384   string out;
    385   int start = 0;
    386   int lastend = -1;
    387   bool last_match_was_empty_string = false;
    388 
    389   while (start <= static_cast<int>(str->length())) {
    390     // If the previous match was for the empty string, we shouldn't
    391     // just match again: we'll match in the same way and get an
    392     // infinite loop.  Instead, we do the match in a special way:
    393     // anchored -- to force another try at the same position --
    394     // and with a flag saying that this time, ignore empty matches.
    395     // If this special match returns, that means there's a non-empty
    396     // match at this position as well, and we can continue.  If not,
    397     // we do what perl does, and just advance by one.
    398     // Notice that perl prints '@@@' for this;
    399     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
    400     int matches;
    401     if (last_match_was_empty_string) {
    402       matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
    403       if (matches <= 0) {
    404         int matchend = start + 1;     // advance one character.
    405         // If the current char is CR and we're in CRLF mode, skip LF too.
    406         // Note it's better to call pcre_fullinfo() than to examine
    407         // all_options(), since options_ could have changed bewteen
    408         // compile-time and now, but this is simpler and safe enough.
    409         // Modified by PH to add ANY and ANYCRLF.
    410         if (matchend < static_cast<int>(str->length()) &&
    411             (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
    412             (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
    413              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
    414              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
    415           matchend++;
    416         }
    417         // We also need to advance more than one char if we're in utf8 mode.
    418 #ifdef SUPPORT_UTF8
    419         if (options_.utf8()) {
    420           while (matchend < static_cast<int>(str->length()) &&
    421                  ((*str)[matchend] & 0xc0) == 0x80)
    422             matchend++;
    423         }
    424 #endif
    425         if (start < static_cast<int>(str->length()))
    426           out.append(*str, start, matchend - start);
    427         start = matchend;
    428         last_match_was_empty_string = false;
    429         continue;
    430       }
    431     } else {
    432       matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
    433       if (matches <= 0)
    434         break;
    435     }
    436     int matchstart = vec[0], matchend = vec[1];
    437     assert(matchstart >= start);
    438     assert(matchend >= matchstart);
    439     out.append(*str, start, matchstart - start);
    440     Rewrite(&out, rewrite, *str, vec, matches);
    441     start = matchend;
    442     lastend = matchend;
    443     count++;
    444     last_match_was_empty_string = (matchstart == matchend);
    445   }
    446 
    447   if (count == 0)
    448     return 0;
    449 
    450   if (start < static_cast<int>(str->length()))
    451     out.append(*str, start, str->length() - start);
    452   swap(out, *str);
    453   return count;
    454 }
    455 
    456 bool RE::Extract(const StringPiece& rewrite,
    457                  const StringPiece& text,
    458                  string *out) const {
    459   int vec[kVecSize];
    460   int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
    461   if (matches == 0)
    462     return false;
    463   out->erase();
    464   return Rewrite(out, rewrite, text, vec, matches);
    465 }
    466 
    467 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
    468   string result;
    469 
    470   // Escape any ascii character not in [A-Za-z_0-9].
    471   //
    472   // Note that it's legal to escape a character even if it has no
    473   // special meaning in a regular expression -- so this function does
    474   // that.  (This also makes it identical to the perl function of the
    475   // same name; see `perldoc -f quotemeta`.)  The one exception is
    476   // escaping NUL: rather than doing backslash + NUL, like perl does,
    477   // we do '\0', because pcre itself doesn't take embedded NUL chars.
    478   for (int ii = 0; ii < unquoted.size(); ++ii) {
    479     // Note that using 'isalnum' here raises the benchmark time from
    480     // 32ns to 58ns:
    481     if (unquoted[ii] == '\0') {
    482       result += "\\0";
    483     } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
    484                (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
    485                (unquoted[ii] < '0' || unquoted[ii] > '9') &&
    486                unquoted[ii] != '_' &&
    487                // If this is the part of a UTF8 or Latin1 character, we need
    488                // to copy this byte without escaping.  Experimentally this is
    489                // what works correctly with the regexp library.
    490                !(unquoted[ii] & 128)) {
    491       result += '\\';
    492       result += unquoted[ii];
    493     } else {
    494       result += unquoted[ii];
    495     }
    496   }
    497 
    498   return result;
    499 }
    500 
    501 /***** Actual matching and rewriting code *****/
    502 
    503 int RE::TryMatch(const StringPiece& text,
    504                  int startpos,
    505                  Anchor anchor,
    506                  bool empty_ok,
    507                  int *vec,
    508                  int vecsize) const {
    509   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
    510   if (re == NULL) {
    511     //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
    512     return 0;
    513   }
    514 
    515   pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
    516   if (options_.match_limit() > 0) {
    517     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
    518     extra.match_limit = options_.match_limit();
    519   }
    520   if (options_.match_limit_recursion() > 0) {
    521     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
    522     extra.match_limit_recursion = options_.match_limit_recursion();
    523   }
    524 
    525   int options = 0;
    526   if (anchor != UNANCHORED)
    527     options |= PCRE_ANCHORED;
    528   if (!empty_ok)
    529     options |= PCRE_NOTEMPTY;
    530 
    531   int rc = pcre_exec(re,              // The regular expression object
    532                      &extra,
    533                      (text.data() == NULL) ? "" : text.data(),
    534                      text.size(),
    535                      startpos,
    536                      options,
    537                      vec,
    538                      vecsize);
    539 
    540   // Handle errors
    541   if (rc == PCRE_ERROR_NOMATCH) {
    542     return 0;
    543   } else if (rc < 0) {
    544     //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
    545     //        re, pattern_.c_str());
    546     return 0;
    547   } else if (rc == 0) {
    548     // pcre_exec() returns 0 as a special case when the number of
    549     // capturing subpatterns exceeds the size of the vector.
    550     // When this happens, there is a match and the output vector
    551     // is filled, but we miss out on the positions of the extra subpatterns.
    552     rc = vecsize / 2;
    553   }
    554 
    555   return rc;
    556 }
    557 
    558 bool RE::DoMatchImpl(const StringPiece& text,
    559                      Anchor anchor,
    560                      int* consumed,
    561                      const Arg* const* args,
    562                      int n,
    563                      int* vec,
    564                      int vecsize) const {
    565   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
    566   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
    567   assert(matches >= 0);  // TryMatch never returns negatives
    568   if (matches == 0)
    569     return false;
    570 
    571   *consumed = vec[1];
    572 
    573   if (n == 0 || args == NULL) {
    574     // We are not interested in results
    575     return true;
    576   }
    577 
    578   if (NumberOfCapturingGroups() < n) {
    579     // RE has fewer capturing groups than number of arg pointers passed in
    580     return false;
    581   }
    582 
    583   // If we got here, we must have matched the whole pattern.
    584   // We do not need (can not do) any more checks on the value of 'matches' here
    585   // -- see the comment for TryMatch.
    586   for (int i = 0; i < n; i++) {
    587     const int start = vec[2*(i+1)];
    588     const int limit = vec[2*(i+1)+1];
    589     if (!args[i]->Parse(text.data() + start, limit-start)) {
    590       // TODO: Should we indicate what the error was?
    591       return false;
    592     }
    593   }
    594 
    595   return true;
    596 }
    597 
    598 bool RE::DoMatch(const StringPiece& text,
    599                  Anchor anchor,
    600                  int* consumed,
    601                  const Arg* const args[],
    602                  int n) const {
    603   assert(n >= 0);
    604   size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
    605                                        // (as for kVecSize)
    606   int space[21];   // use stack allocation for small vecsize (common case)
    607   int* vec = vecsize <= 21 ? space : new int[vecsize];
    608   bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
    609   if (vec != space) delete [] vec;
    610   return retval;
    611 }
    612 
    613 bool RE::Rewrite(string *out, const StringPiece &rewrite,
    614                  const StringPiece &text, int *vec, int veclen) const {
    615   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    616        s < end; s++) {
    617     int c = *s;
    618     if (c == '\\') {
    619       c = *++s;
    620       if (isdigit(c)) {
    621         int n = (c - '0');
    622         if (n >= veclen) {
    623           //fprintf(stderr, requested group %d in regexp %.*s\n",
    624           //        n, rewrite.size(), rewrite.data());
    625           return false;
    626         }
    627         int start = vec[2 * n];
    628         if (start >= 0)
    629           out->append(text.data() + start, vec[2 * n + 1] - start);
    630       } else if (c == '\\') {
    631         *out += '\\';
    632       } else {
    633         //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
    634         //        rewrite.size(), rewrite.data());
    635         return false;
    636       }
    637     } else {
    638       *out += c;
    639     }
    640   }
    641   return true;
    642 }
    643 
    644 // Return the number of capturing subpatterns, or -1 if the
    645 // regexp wasn't valid on construction.
    646 int RE::NumberOfCapturingGroups() const {
    647   if (re_partial_ == NULL) return -1;
    648 
    649   int result;
    650   int pcre_retval = pcre_fullinfo(re_partial_,  // The regular expression object
    651                                   NULL,         // We did not study the pattern
    652                                   PCRE_INFO_CAPTURECOUNT,
    653                                   &result);
    654   assert(pcre_retval == 0);
    655   return result;
    656 }
    657 
    658 /***** Parsers for various types *****/
    659 
    660 bool Arg::parse_null(const char* str, int n, void* dest) {
    661   // We fail if somebody asked us to store into a non-NULL void* pointer
    662   return (dest == NULL);
    663 }
    664 
    665 bool Arg::parse_string(const char* str, int n, void* dest) {
    666   if (dest == NULL) return true;
    667   reinterpret_cast<string*>(dest)->assign(str, n);
    668   return true;
    669 }
    670 
    671 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
    672   if (dest == NULL) return true;
    673   reinterpret_cast<StringPiece*>(dest)->set(str, n);
    674   return true;
    675 }
    676 
    677 bool Arg::parse_char(const char* str, int n, void* dest) {
    678   if (n != 1) return false;
    679   if (dest == NULL) return true;
    680   *(reinterpret_cast<char*>(dest)) = str[0];
    681   return true;
    682 }
    683 
    684 bool Arg::parse_uchar(const char* str, int n, void* dest) {
    685   if (n != 1) return false;
    686   if (dest == NULL) return true;
    687   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
    688   return true;
    689 }
    690 
    691 // Largest number spec that we are willing to parse
    692 static const int kMaxNumberLength = 32;
    693 
    694 // REQUIRES "buf" must have length at least kMaxNumberLength+1
    695 // REQUIRES "n > 0"
    696 // Copies "str" into "buf" and null-terminates if necessary.
    697 // Returns one of:
    698 //      a. "str" if no termination is needed
    699 //      b. "buf" if the string was copied and null-terminated
    700 //      c. "" if the input was invalid and has no hope of being parsed
    701 static const char* TerminateNumber(char* buf, const char* str, int n) {
    702   if ((n > 0) && isspace(*str)) {
    703     // We are less forgiving than the strtoxxx() routines and do not
    704     // allow leading spaces.
    705     return "";
    706   }
    707 
    708   // See if the character right after the input text may potentially
    709   // look like a digit.
    710   if (isdigit(str[n]) ||
    711       ((str[n] >= 'a') && (str[n] <= 'f')) ||
    712       ((str[n] >= 'A') && (str[n] <= 'F'))) {
    713     if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
    714     memcpy(buf, str, n);
    715     buf[n] = '\0';
    716     return buf;
    717   } else {
    718     // We can parse right out of the supplied string, so return it.
    719     return str;
    720   }
    721 }
    722 
    723 bool Arg::parse_long_radix(const char* str,
    724                            int n,
    725                            void* dest,
    726                            int radix) {
    727   if (n == 0) return false;
    728   char buf[kMaxNumberLength+1];
    729   str = TerminateNumber(buf, str, n);
    730   char* end;
    731   errno = 0;
    732   long r = strtol(str, &end, radix);
    733   if (end != str + n) return false;   // Leftover junk
    734   if (errno) return false;
    735   if (dest == NULL) return true;
    736   *(reinterpret_cast<long*>(dest)) = r;
    737   return true;
    738 }
    739 
    740 bool Arg::parse_ulong_radix(const char* str,
    741                             int n,
    742                             void* dest,
    743                             int radix) {
    744   if (n == 0) return false;
    745   char buf[kMaxNumberLength+1];
    746   str = TerminateNumber(buf, str, n);
    747   if (str[0] == '-') return false;    // strtoul() on a negative number?!
    748   char* end;
    749   errno = 0;
    750   unsigned long r = strtoul(str, &end, radix);
    751   if (end != str + n) return false;   // Leftover junk
    752   if (errno) return false;
    753   if (dest == NULL) return true;
    754   *(reinterpret_cast<unsigned long*>(dest)) = r;
    755   return true;
    756 }
    757 
    758 bool Arg::parse_short_radix(const char* str,
    759                             int n,
    760                             void* dest,
    761                             int radix) {
    762   long r;
    763   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
    764   if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
    765   if (dest == NULL) return true;
    766   *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
    767   return true;
    768 }
    769 
    770 bool Arg::parse_ushort_radix(const char* str,
    771                              int n,
    772                              void* dest,
    773                              int radix) {
    774   unsigned long r;
    775   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
    776   if (r > USHRT_MAX) return false;                      // Out of range
    777   if (dest == NULL) return true;
    778   *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
    779   return true;
    780 }
    781 
    782 bool Arg::parse_int_radix(const char* str,
    783                           int n,
    784                           void* dest,
    785                           int radix) {
    786   long r;
    787   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
    788   if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
    789   if (dest == NULL) return true;
    790   *(reinterpret_cast<int*>(dest)) = r;
    791   return true;
    792 }
    793 
    794 bool Arg::parse_uint_radix(const char* str,
    795                            int n,
    796                            void* dest,
    797                            int radix) {
    798   unsigned long r;
    799   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
    800   if (r > UINT_MAX) return false;                       // Out of range
    801   if (dest == NULL) return true;
    802   *(reinterpret_cast<unsigned int*>(dest)) = r;
    803   return true;
    804 }
    805 
    806 bool Arg::parse_longlong_radix(const char* str,
    807                                int n,
    808                                void* dest,
    809                                int radix) {
    810 #ifndef HAVE_LONG_LONG
    811   return false;
    812 #else
    813   if (n == 0) return false;
    814   char buf[kMaxNumberLength+1];
    815   str = TerminateNumber(buf, str, n);
    816   char* end;
    817   errno = 0;
    818 #if defined HAVE_STRTOQ
    819   long long r = strtoq(str, &end, radix);
    820 #elif defined HAVE_STRTOLL
    821   long long r = strtoll(str, &end, radix);
    822 #elif defined HAVE__STRTOI64
    823   long long r = _strtoi64(str, &end, radix);
    824 #elif defined HAVE_STRTOIMAX
    825   long long r = strtoimax(str, &end, radix);
    826 #else
    827 #error parse_longlong_radix: cannot convert input to a long-long
    828 #endif
    829   if (end != str + n) return false;   // Leftover junk
    830   if (errno) return false;
    831   if (dest == NULL) return true;
    832   *(reinterpret_cast<long long*>(dest)) = r;
    833   return true;
    834 #endif   /* HAVE_LONG_LONG */
    835 }
    836 
    837 bool Arg::parse_ulonglong_radix(const char* str,
    838                                 int n,
    839                                 void* dest,
    840                                 int radix) {
    841 #ifndef HAVE_UNSIGNED_LONG_LONG
    842   return false;
    843 #else
    844   if (n == 0) return false;
    845   char buf[kMaxNumberLength+1];
    846   str = TerminateNumber(buf, str, n);
    847   if (str[0] == '-') return false;    // strtoull() on a negative number?!
    848   char* end;
    849   errno = 0;
    850 #if defined HAVE_STRTOQ
    851   unsigned long long r = strtouq(str, &end, radix);
    852 #elif defined HAVE_STRTOLL
    853   unsigned long long r = strtoull(str, &end, radix);
    854 #elif defined HAVE__STRTOI64
    855   unsigned long long r = _strtoui64(str, &end, radix);
    856 #elif defined HAVE_STRTOIMAX
    857   unsigned long long r = strtoumax(str, &end, radix);
    858 #else
    859 #error parse_ulonglong_radix: cannot convert input to a long-long
    860 #endif
    861   if (end != str + n) return false;   // Leftover junk
    862   if (errno) return false;
    863   if (dest == NULL) return true;
    864   *(reinterpret_cast<unsigned long long*>(dest)) = r;
    865   return true;
    866 #endif   /* HAVE_UNSIGNED_LONG_LONG */
    867 }
    868 
    869 bool Arg::parse_double(const char* str, int n, void* dest) {
    870   if (n == 0) return false;
    871   static const int kMaxLength = 200;
    872   char buf[kMaxLength];
    873   if (n >= kMaxLength) return false;
    874   memcpy(buf, str, n);
    875   buf[n] = '\0';
    876   errno = 0;
    877   char* end;
    878   double r = strtod(buf, &end);
    879   if (end != buf + n) return false;   // Leftover junk
    880   if (errno) return false;
    881   if (dest == NULL) return true;
    882   *(reinterpret_cast<double*>(dest)) = r;
    883   return true;
    884 }
    885 
    886 bool Arg::parse_float(const char* str, int n, void* dest) {
    887   double r;
    888   if (!parse_double(str, n, &r)) return false;
    889   if (dest == NULL) return true;
    890   *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
    891   return true;
    892 }
    893 
    894 
    895 #define DEFINE_INTEGER_PARSERS(name)                                    \
    896   bool Arg::parse_##name(const char* str, int n, void* dest) {          \
    897     return parse_##name##_radix(str, n, dest, 10);                      \
    898   }                                                                     \
    899   bool Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
    900     return parse_##name##_radix(str, n, dest, 16);                      \
    901   }                                                                     \
    902   bool Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
    903     return parse_##name##_radix(str, n, dest, 8);                       \
    904   }                                                                     \
    905   bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
    906     return parse_##name##_radix(str, n, dest, 0);                       \
    907   }
    908 
    909 DEFINE_INTEGER_PARSERS(short)      /*                                   */
    910 DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
    911 DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
    912 DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
    913 DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
    914 DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
    915 DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
    916 DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
    917 
    918 #undef DEFINE_INTEGER_PARSERS
    919 
    920 }   // namespace pcrecpp
    921