Home | History | Annotate | Download | only in dist
      1 // Copyright (c) 2010, Google Inc.
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are
      6 // met:
      7 //
      8 //     * Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 //     * Redistributions in binary form must reproduce the above
     11 // copyright notice, this list of conditions and the following disclaimer
     12 // in the documentation and/or other materials provided with the
     13 // distribution.
     14 //     * Neither the name of Google Inc. nor the names of its
     15 // contributors may be used to endorse or promote products derived from
     16 // this software without specific prior written permission.
     17 //
     18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 //
     30 // Author: Sanjay Ghemawat
     31 
     32 #ifdef HAVE_CONFIG_H
     33 #include "config.h"
     34 #endif
     35 
     36 #include <stdlib.h>
     37 #include <stdio.h>
     38 #include <ctype.h>
     39 #include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
     40 #include <string.h>      /* for memcpy */
     41 #include <assert.h>
     42 #include <errno.h>
     43 #include <string>
     44 #include <algorithm>
     45 
     46 #include "pcrecpp_internal.h"
     47 #include "pcre.h"
     48 #include "pcrecpp.h"
     49 #include "pcre_stringpiece.h"
     50 
     51 
     52 namespace pcrecpp {
     53 
// Maximum number of args we can set.  This matches the sixteen optional
// Arg parameters taken by FullMatch/PartialMatch/Consume/FindAndConsume.
static const int kMaxArgs = 16;
// Size of the offset vector handed to the matcher: one slot for the whole
// match plus one per argument, three ints per slot.
static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
     57 
// Special object that stands-in for no argument.  The matching interfaces
// recognize it by address (&ptrN == &no_arg), not by value.
Arg RE::no_arg((void*)NULL);
     60 
// This is for ABI compatibility with old versions of pcre (pre-7.6),
// which defined a global no_arg variable instead of putting it in the
// RE class.  This works on GCC >= 3, at least.  It definitely works
// for ELF, but may not for other object formats (Mach-O, for
// instance, does not support aliases).  We could probably have a more
// inclusive test if we ever needed it.  (Note that not only the
// __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
// GNU-specific.)
//
// The two-level ULP_AS_STRING macro expands __USER_LABEL_PREFIX__ before
// stringizing it, so the alias target is the platform's label prefix
// followed by the mangled name of pcrecpp::RE::no_arg.
#if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__)
# define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
# define ULP_AS_STRING_INTERNAL(x)   #x
# define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
extern Arg no_arg
  __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
#endif
     76 
// If a regular expression has no error, its error_ field points here.
// Cleanup() compares error_ against this object's address to decide
// whether the error string was heap-allocated and must be deleted.
static const string empty_string;

// If the user doesn't ask for any options (Init() is given a NULL
// options pointer), we just use this one.
static RE_Options default_options;
     82 
     83 void RE::Init(const string& pat, const RE_Options* options) {
     84   pattern_ = pat;
     85   if (options == NULL) {
     86     options_ = default_options;
     87   } else {
     88     options_ = *options;
     89   }
     90   error_ = &empty_string;
     91   re_full_ = NULL;
     92   re_partial_ = NULL;
     93 
     94   re_partial_ = Compile(UNANCHORED);
     95   if (re_partial_ != NULL) {
     96     re_full_ = Compile(ANCHOR_BOTH);
     97   }
     98 }
     99 
    100 void RE::Cleanup() {
    101   if (re_full_ != NULL)         (*pcre_free)(re_full_);
    102   if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
    103   if (error_ != &empty_string)  delete error_;
    104 }
    105 
    106 
    107 RE::~RE() {
    108   Cleanup();
    109 }
    110 
    111 
    112 pcre* RE::Compile(Anchor anchor) {
    113   // First, convert RE_Options into pcre options
    114   int pcre_options = 0;
    115   pcre_options = options_.all_options();
    116 
    117   // Special treatment for anchoring.  This is needed because at
    118   // runtime pcre only provides an option for anchoring at the
    119   // beginning of a string (unless you use offset).
    120   //
    121   // There are three types of anchoring we want:
    122   //    UNANCHORED      Compile the original pattern, and use
    123   //                    a pcre unanchored match.
    124   //    ANCHOR_START    Compile the original pattern, and use
    125   //                    a pcre anchored match.
    126   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
    127   //                    and use a pcre anchored match.
    128 
    129   const char* compile_error;
    130   int eoffset;
    131   pcre* re;
    132   if (anchor != ANCHOR_BOTH) {
    133     re = pcre_compile(pattern_.c_str(), pcre_options,
    134                       &compile_error, &eoffset, NULL);
    135   } else {
    136     // Tack a '\z' at the end of RE.  Parenthesize it first so that
    137     // the '\z' applies to all top-level alternatives in the regexp.
    138     string wrapped = "(?:";  // A non-counting grouping operator
    139     wrapped += pattern_;
    140     wrapped += ")\\z";
    141     re = pcre_compile(wrapped.c_str(), pcre_options,
    142                       &compile_error, &eoffset, NULL);
    143   }
    144   if (re == NULL) {
    145     if (error_ == &empty_string) error_ = new string(compile_error);
    146   }
    147   return re;
    148 }
    149 
    150 /***** Matching interfaces *****/
    151 
    152 bool RE::FullMatch(const StringPiece& text,
    153                    const Arg& ptr1,
    154                    const Arg& ptr2,
    155                    const Arg& ptr3,
    156                    const Arg& ptr4,
    157                    const Arg& ptr5,
    158                    const Arg& ptr6,
    159                    const Arg& ptr7,
    160                    const Arg& ptr8,
    161                    const Arg& ptr9,
    162                    const Arg& ptr10,
    163                    const Arg& ptr11,
    164                    const Arg& ptr12,
    165                    const Arg& ptr13,
    166                    const Arg& ptr14,
    167                    const Arg& ptr15,
    168                    const Arg& ptr16) const {
    169   const Arg* args[kMaxArgs];
    170   int n = 0;
    171   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
    172   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
    173   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
    174   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
    175   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
    176   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
    177   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
    178   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
    179   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
    180   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
    181   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
    182   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
    183   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
    184   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
    185   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
    186   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
    187  done:
    188 
    189   int consumed;
    190   int vec[kVecSize];
    191   return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
    192 }
    193 
    194 bool RE::PartialMatch(const StringPiece& text,
    195                       const Arg& ptr1,
    196                       const Arg& ptr2,
    197                       const Arg& ptr3,
    198                       const Arg& ptr4,
    199                       const Arg& ptr5,
    200                       const Arg& ptr6,
    201                       const Arg& ptr7,
    202                       const Arg& ptr8,
    203                       const Arg& ptr9,
    204                       const Arg& ptr10,
    205                       const Arg& ptr11,
    206                       const Arg& ptr12,
    207                       const Arg& ptr13,
    208                       const Arg& ptr14,
    209                       const Arg& ptr15,
    210                       const Arg& ptr16) const {
    211   const Arg* args[kMaxArgs];
    212   int n = 0;
    213   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
    214   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
    215   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
    216   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
    217   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
    218   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
    219   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
    220   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
    221   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
    222   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
    223   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
    224   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
    225   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
    226   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
    227   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
    228   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
    229  done:
    230 
    231   int consumed;
    232   int vec[kVecSize];
    233   return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
    234 }
    235 
    236 bool RE::Consume(StringPiece* input,
    237                  const Arg& ptr1,
    238                  const Arg& ptr2,
    239                  const Arg& ptr3,
    240                  const Arg& ptr4,
    241                  const Arg& ptr5,
    242                  const Arg& ptr6,
    243                  const Arg& ptr7,
    244                  const Arg& ptr8,
    245                  const Arg& ptr9,
    246                  const Arg& ptr10,
    247                  const Arg& ptr11,
    248                  const Arg& ptr12,
    249                  const Arg& ptr13,
    250                  const Arg& ptr14,
    251                  const Arg& ptr15,
    252                  const Arg& ptr16) const {
    253   const Arg* args[kMaxArgs];
    254   int n = 0;
    255   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
    256   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
    257   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
    258   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
    259   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
    260   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
    261   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
    262   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
    263   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
    264   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
    265   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
    266   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
    267   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
    268   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
    269   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
    270   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
    271  done:
    272 
    273   int consumed;
    274   int vec[kVecSize];
    275   if (DoMatchImpl(*input, ANCHOR_START, &consumed,
    276                   args, n, vec, kVecSize)) {
    277     input->remove_prefix(consumed);
    278     return true;
    279   } else {
    280     return false;
    281   }
    282 }
    283 
    284 bool RE::FindAndConsume(StringPiece* input,
    285                         const Arg& ptr1,
    286                         const Arg& ptr2,
    287                         const Arg& ptr3,
    288                         const Arg& ptr4,
    289                         const Arg& ptr5,
    290                         const Arg& ptr6,
    291                         const Arg& ptr7,
    292                         const Arg& ptr8,
    293                         const Arg& ptr9,
    294                         const Arg& ptr10,
    295                         const Arg& ptr11,
    296                         const Arg& ptr12,
    297                         const Arg& ptr13,
    298                         const Arg& ptr14,
    299                         const Arg& ptr15,
    300                         const Arg& ptr16) const {
    301   const Arg* args[kMaxArgs];
    302   int n = 0;
    303   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
    304   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
    305   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
    306   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
    307   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
    308   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
    309   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
    310   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
    311   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
    312   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
    313   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
    314   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
    315   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
    316   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
    317   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
    318   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
    319  done:
    320 
    321   int consumed;
    322   int vec[kVecSize];
    323   if (DoMatchImpl(*input, UNANCHORED, &consumed,
    324                   args, n, vec, kVecSize)) {
    325     input->remove_prefix(consumed);
    326     return true;
    327   } else {
    328     return false;
    329   }
    330 }
    331 
    332 bool RE::Replace(const StringPiece& rewrite,
    333                  string *str) const {
    334   int vec[kVecSize];
    335   int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
    336   if (matches == 0)
    337     return false;
    338 
    339   string s;
    340   if (!Rewrite(&s, rewrite, *str, vec, matches))
    341     return false;
    342 
    343   assert(vec[0] >= 0);
    344   assert(vec[1] >= 0);
    345   str->replace(vec[0], vec[1] - vec[0], s);
    346   return true;
    347 }
    348 
    349 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
    350 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
    351 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
    352 
    353 static int NewlineMode(int pcre_options) {
    354   // TODO: if we can make it threadsafe, cache this var
    355   int newline_mode = 0;
    356   /* if (newline_mode) return newline_mode; */  // do this once it's cached
    357   if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
    358                       PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
    359     newline_mode = (pcre_options &
    360                     (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
    361                      PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
    362   } else {
    363     int newline;
    364     pcre_config(PCRE_CONFIG_NEWLINE, &newline);
    365     if (newline == 10)
    366       newline_mode = PCRE_NEWLINE_LF;
    367     else if (newline == 13)
    368       newline_mode = PCRE_NEWLINE_CR;
    369     else if (newline == 3338)
    370       newline_mode = PCRE_NEWLINE_CRLF;
    371     else if (newline == -1)
    372       newline_mode = PCRE_NEWLINE_ANY;
    373     else if (newline == -2)
    374       newline_mode = PCRE_NEWLINE_ANYCRLF;
    375     else
    376       assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
    377   }
    378   return newline_mode;
    379 }
    380 
    381 int RE::GlobalReplace(const StringPiece& rewrite,
    382                       string *str) const {
    383   int count = 0;
    384   int vec[kVecSize];
    385   string out;
    386   int start = 0;
    387   bool last_match_was_empty_string = false;
    388 
    389   while (start <= static_cast<int>(str->length())) {
    390     // If the previous match was for the empty string, we shouldn't
    391     // just match again: we'll match in the same way and get an
    392     // infinite loop.  Instead, we do the match in a special way:
    393     // anchored -- to force another try at the same position --
    394     // and with a flag saying that this time, ignore empty matches.
    395     // If this special match returns, that means there's a non-empty
    396     // match at this position as well, and we can continue.  If not,
    397     // we do what perl does, and just advance by one.
    398     // Notice that perl prints '@@@' for this;
    399     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
    400     int matches;
    401     if (last_match_was_empty_string) {
    402       matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
    403       if (matches <= 0) {
    404         int matchend = start + 1;     // advance one character.
    405         // If the current char is CR and we're in CRLF mode, skip LF too.
    406         // Note it's better to call pcre_fullinfo() than to examine
    407         // all_options(), since options_ could have changed bewteen
    408         // compile-time and now, but this is simpler and safe enough.
    409         // Modified by PH to add ANY and ANYCRLF.
    410         if (matchend < static_cast<int>(str->length()) &&
    411             (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
    412             (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
    413              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
    414              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
    415           matchend++;
    416         }
    417         // We also need to advance more than one char if we're in utf8 mode.
    418 #ifdef SUPPORT_UTF8
    419         if (options_.utf8()) {
    420           while (matchend < static_cast<int>(str->length()) &&
    421                  ((*str)[matchend] & 0xc0) == 0x80)
    422             matchend++;
    423         }
    424 #endif
    425         if (start < static_cast<int>(str->length()))
    426           out.append(*str, start, matchend - start);
    427         start = matchend;
    428         last_match_was_empty_string = false;
    429         continue;
    430       }
    431     } else {
    432       matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
    433       if (matches <= 0)
    434         break;
    435     }
    436     int matchstart = vec[0], matchend = vec[1];
    437     assert(matchstart >= start);
    438     assert(matchend >= matchstart);
    439     out.append(*str, start, matchstart - start);
    440     Rewrite(&out, rewrite, *str, vec, matches);
    441     start = matchend;
    442     count++;
    443     last_match_was_empty_string = (matchstart == matchend);
    444   }
    445 
    446   if (count == 0)
    447     return 0;
    448 
    449   if (start < static_cast<int>(str->length()))
    450     out.append(*str, start, str->length() - start);
    451   swap(out, *str);
    452   return count;
    453 }
    454 
    455 bool RE::Extract(const StringPiece& rewrite,
    456                  const StringPiece& text,
    457                  string *out) const {
    458   int vec[kVecSize];
    459   int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
    460   if (matches == 0)
    461     return false;
    462   out->erase();
    463   return Rewrite(out, rewrite, text, vec, matches);
    464 }
    465 
    466 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
    467   string result;
    468 
    469   // Escape any ascii character not in [A-Za-z_0-9].
    470   //
    471   // Note that it's legal to escape a character even if it has no
    472   // special meaning in a regular expression -- so this function does
    473   // that.  (This also makes it identical to the perl function of the
    474   // same name; see `perldoc -f quotemeta`.)  The one exception is
    475   // escaping NUL: rather than doing backslash + NUL, like perl does,
    476   // we do '\0', because pcre itself doesn't take embedded NUL chars.
    477   for (int ii = 0; ii < unquoted.size(); ++ii) {
    478     // Note that using 'isalnum' here raises the benchmark time from
    479     // 32ns to 58ns:
    480     if (unquoted[ii] == '\0') {
    481       result += "\\0";
    482     } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
    483                (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
    484                (unquoted[ii] < '0' || unquoted[ii] > '9') &&
    485                unquoted[ii] != '_' &&
    486                // If this is the part of a UTF8 or Latin1 character, we need
    487                // to copy this byte without escaping.  Experimentally this is
    488                // what works correctly with the regexp library.
    489                !(unquoted[ii] & 128)) {
    490       result += '\\';
    491       result += unquoted[ii];
    492     } else {
    493       result += unquoted[ii];
    494     }
    495   }
    496 
    497   return result;
    498 }
    499 
    500 /***** Actual matching and rewriting code *****/
    501 
    502 int RE::TryMatch(const StringPiece& text,
    503                  int startpos,
    504                  Anchor anchor,
    505                  bool empty_ok,
    506                  int *vec,
    507                  int vecsize) const {
    508   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
    509   if (re == NULL) {
    510     //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
    511     return 0;
    512   }
    513 
    514   pcre_extra extra = { 0, 0, 0, 0, 0, 0, 0, 0 };
    515   if (options_.match_limit() > 0) {
    516     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
    517     extra.match_limit = options_.match_limit();
    518   }
    519   if (options_.match_limit_recursion() > 0) {
    520     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
    521     extra.match_limit_recursion = options_.match_limit_recursion();
    522   }
    523 
    524   // int options = 0;
    525   // Changed by PH as a result of bugzilla #1288
    526   int options = (options_.all_options() & PCRE_NO_UTF8_CHECK);
    527 
    528   if (anchor != UNANCHORED)
    529     options |= PCRE_ANCHORED;
    530   if (!empty_ok)
    531     options |= PCRE_NOTEMPTY;
    532 
    533   int rc = pcre_exec(re,              // The regular expression object
    534                      &extra,
    535                      (text.data() == NULL) ? "" : text.data(),
    536                      text.size(),
    537                      startpos,
    538                      options,
    539                      vec,
    540                      vecsize);
    541 
    542   // Handle errors
    543   if (rc == PCRE_ERROR_NOMATCH) {
    544     return 0;
    545   } else if (rc < 0) {
    546     //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
    547     //        re, pattern_.c_str());
    548     return 0;
    549   } else if (rc == 0) {
    550     // pcre_exec() returns 0 as a special case when the number of
    551     // capturing subpatterns exceeds the size of the vector.
    552     // When this happens, there is a match and the output vector
    553     // is filled, but we miss out on the positions of the extra subpatterns.
    554     rc = vecsize / 2;
    555   }
    556 
    557   return rc;
    558 }
    559 
    560 bool RE::DoMatchImpl(const StringPiece& text,
    561                      Anchor anchor,
    562                      int* consumed,
    563                      const Arg* const* args,
    564                      int n,
    565                      int* vec,
    566                      int vecsize) const {
    567   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
    568   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
    569   assert(matches >= 0);  // TryMatch never returns negatives
    570   if (matches == 0)
    571     return false;
    572 
    573   *consumed = vec[1];
    574 
    575   if (n == 0 || args == NULL) {
    576     // We are not interested in results
    577     return true;
    578   }
    579 
    580   if (NumberOfCapturingGroups() < n) {
    581     // RE has fewer capturing groups than number of arg pointers passed in
    582     return false;
    583   }
    584 
    585   // If we got here, we must have matched the whole pattern.
    586   // We do not need (can not do) any more checks on the value of 'matches' here
    587   // -- see the comment for TryMatch.
    588   for (int i = 0; i < n; i++) {
    589     const int start = vec[2*(i+1)];
    590     const int limit = vec[2*(i+1)+1];
    591     if (!args[i]->Parse(text.data() + start, limit-start)) {
    592       // TODO: Should we indicate what the error was?
    593       return false;
    594     }
    595   }
    596 
    597   return true;
    598 }
    599 
    600 bool RE::DoMatch(const StringPiece& text,
    601                  Anchor anchor,
    602                  int* consumed,
    603                  const Arg* const args[],
    604                  int n) const {
    605   assert(n >= 0);
    606   size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
    607                                        // (as for kVecSize)
    608   int space[21];   // use stack allocation for small vecsize (common case)
    609   int* vec = vecsize <= 21 ? space : new int[vecsize];
    610   bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
    611   if (vec != space) delete [] vec;
    612   return retval;
    613 }
    614 
    615 bool RE::Rewrite(string *out, const StringPiece &rewrite,
    616                  const StringPiece &text, int *vec, int veclen) const {
    617   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    618        s < end; s++) {
    619     int c = *s;
    620     if (c == '\\') {
    621       c = *++s;
    622       if (isdigit(c)) {
    623         int n = (c - '0');
    624         if (n >= veclen) {
    625           //fprintf(stderr, requested group %d in regexp %.*s\n",
    626           //        n, rewrite.size(), rewrite.data());
    627           return false;
    628         }
    629         int start = vec[2 * n];
    630         if (start >= 0)
    631           out->append(text.data() + start, vec[2 * n + 1] - start);
    632       } else if (c == '\\') {
    633         *out += '\\';
    634       } else {
    635         //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
    636         //        rewrite.size(), rewrite.data());
    637         return false;
    638       }
    639     } else {
    640       *out += c;
    641     }
    642   }
    643   return true;
    644 }
    645 
    646 // Return the number of capturing subpatterns, or -1 if the
    647 // regexp wasn't valid on construction.
    648 int RE::NumberOfCapturingGroups() const {
    649   if (re_partial_ == NULL) return -1;
    650 
    651   int result;
    652   int pcre_retval = pcre_fullinfo(re_partial_,  // The regular expression object
    653                                   NULL,         // We did not study the pattern
    654                                   PCRE_INFO_CAPTURECOUNT,
    655                                   &result);
    656   assert(pcre_retval == 0);
    657   return result;
    658 }
    659 
    660 /***** Parsers for various types *****/
    661 
    662 bool Arg::parse_null(const char* str, int n, void* dest) {
    663   (void)str;
    664   (void)n;
    665   // We fail if somebody asked us to store into a non-NULL void* pointer
    666   return (dest == NULL);
    667 }
    668 
    669 bool Arg::parse_string(const char* str, int n, void* dest) {
    670   if (dest == NULL) return true;
    671   reinterpret_cast<string*>(dest)->assign(str, n);
    672   return true;
    673 }
    674 
    675 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
    676   if (dest == NULL) return true;
    677   reinterpret_cast<StringPiece*>(dest)->set(str, n);
    678   return true;
    679 }
    680 
    681 bool Arg::parse_char(const char* str, int n, void* dest) {
    682   if (n != 1) return false;
    683   if (dest == NULL) return true;
    684   *(reinterpret_cast<char*>(dest)) = str[0];
    685   return true;
    686 }
    687 
    688 bool Arg::parse_uchar(const char* str, int n, void* dest) {
    689   if (n != 1) return false;
    690   if (dest == NULL) return true;
    691   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
    692   return true;
    693 }
    694 
    695 // Largest number spec that we are willing to parse
    696 static const int kMaxNumberLength = 32;
    697 
    698 // REQUIRES "buf" must have length at least kMaxNumberLength+1
    699 // REQUIRES "n > 0"
    700 // Copies "str" into "buf" and null-terminates if necessary.
    701 // Returns one of:
    702 //      a. "str" if no termination is needed
    703 //      b. "buf" if the string was copied and null-terminated
    704 //      c. "" if the input was invalid and has no hope of being parsed
    705 static const char* TerminateNumber(char* buf, const char* str, int n) {
    706   if ((n > 0) && isspace(*str)) {
    707     // We are less forgiving than the strtoxxx() routines and do not
    708     // allow leading spaces.
    709     return "";
    710   }
    711 
    712   // See if the character right after the input text may potentially
    713   // look like a digit.
    714   if (isdigit(str[n]) ||
    715       ((str[n] >= 'a') && (str[n] <= 'f')) ||
    716       ((str[n] >= 'A') && (str[n] <= 'F'))) {
    717     if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
    718     memcpy(buf, str, n);
    719     buf[n] = '\0';
    720     return buf;
    721   } else {
    722     // We can parse right out of the supplied string, so return it.
    723     return str;
    724   }
    725 }
    726 
    727 bool Arg::parse_long_radix(const char* str,
    728                            int n,
    729                            void* dest,
    730                            int radix) {
    731   if (n == 0) return false;
    732   char buf[kMaxNumberLength+1];
    733   str = TerminateNumber(buf, str, n);
    734   char* end;
    735   errno = 0;
    736   long r = strtol(str, &end, radix);
    737   if (end != str + n) return false;   // Leftover junk
    738   if (errno) return false;
    739   if (dest == NULL) return true;
    740   *(reinterpret_cast<long*>(dest)) = r;
    741   return true;
    742 }
    743 
    744 bool Arg::parse_ulong_radix(const char* str,
    745                             int n,
    746                             void* dest,
    747                             int radix) {
    748   if (n == 0) return false;
    749   char buf[kMaxNumberLength+1];
    750   str = TerminateNumber(buf, str, n);
    751   if (str[0] == '-') return false;    // strtoul() on a negative number?!
    752   char* end;
    753   errno = 0;
    754   unsigned long r = strtoul(str, &end, radix);
    755   if (end != str + n) return false;   // Leftover junk
    756   if (errno) return false;
    757   if (dest == NULL) return true;
    758   *(reinterpret_cast<unsigned long*>(dest)) = r;
    759   return true;
    760 }
    761 
    762 bool Arg::parse_short_radix(const char* str,
    763                             int n,
    764                             void* dest,
    765                             int radix) {
    766   long r;
    767   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
    768   if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
    769   if (dest == NULL) return true;
    770   *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
    771   return true;
    772 }
    773 
    774 bool Arg::parse_ushort_radix(const char* str,
    775                              int n,
    776                              void* dest,
    777                              int radix) {
    778   unsigned long r;
    779   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
    780   if (r > USHRT_MAX) return false;                      // Out of range
    781   if (dest == NULL) return true;
    782   *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
    783   return true;
    784 }
    785 
    786 bool Arg::parse_int_radix(const char* str,
    787                           int n,
    788                           void* dest,
    789                           int radix) {
    790   long r;
    791   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
    792   if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
    793   if (dest == NULL) return true;
    794   *(reinterpret_cast<int*>(dest)) = r;
    795   return true;
    796 }
    797 
    798 bool Arg::parse_uint_radix(const char* str,
    799                            int n,
    800                            void* dest,
    801                            int radix) {
    802   unsigned long r;
    803   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
    804   if (r > UINT_MAX) return false;                       // Out of range
    805   if (dest == NULL) return true;
    806   *(reinterpret_cast<unsigned int*>(dest)) = r;
    807   return true;
    808 }
    809 
    810 bool Arg::parse_longlong_radix(const char* str,
    811                                int n,
    812                                void* dest,
    813                                int radix) {
    814 #ifndef HAVE_LONG_LONG
    815   return false;
    816 #else
    817   if (n == 0) return false;
    818   char buf[kMaxNumberLength+1];
    819   str = TerminateNumber(buf, str, n);
    820   char* end;
    821   errno = 0;
    822 #if defined HAVE_STRTOQ
    823   long long r = strtoq(str, &end, radix);
    824 #elif defined HAVE_STRTOLL
    825   long long r = strtoll(str, &end, radix);
    826 #elif defined HAVE__STRTOI64
    827   long long r = _strtoi64(str, &end, radix);
    828 #elif defined HAVE_STRTOIMAX
    829   long long r = strtoimax(str, &end, radix);
    830 #else
    831 #error parse_longlong_radix: cannot convert input to a long-long
    832 #endif
    833   if (end != str + n) return false;   // Leftover junk
    834   if (errno) return false;
    835   if (dest == NULL) return true;
    836   *(reinterpret_cast<long long*>(dest)) = r;
    837   return true;
    838 #endif   /* HAVE_LONG_LONG */
    839 }
    840 
    841 bool Arg::parse_ulonglong_radix(const char* str,
    842                                 int n,
    843                                 void* dest,
    844                                 int radix) {
    845 #ifndef HAVE_UNSIGNED_LONG_LONG
    846   return false;
    847 #else
    848   if (n == 0) return false;
    849   char buf[kMaxNumberLength+1];
    850   str = TerminateNumber(buf, str, n);
    851   if (str[0] == '-') return false;    // strtoull() on a negative number?!
    852   char* end;
    853   errno = 0;
    854 #if defined HAVE_STRTOQ
    855   unsigned long long r = strtouq(str, &end, radix);
    856 #elif defined HAVE_STRTOLL
    857   unsigned long long r = strtoull(str, &end, radix);
    858 #elif defined HAVE__STRTOI64
    859   unsigned long long r = _strtoui64(str, &end, radix);
    860 #elif defined HAVE_STRTOIMAX
    861   unsigned long long r = strtoumax(str, &end, radix);
    862 #else
    863 #error parse_ulonglong_radix: cannot convert input to a long-long
    864 #endif
    865   if (end != str + n) return false;   // Leftover junk
    866   if (errno) return false;
    867   if (dest == NULL) return true;
    868   *(reinterpret_cast<unsigned long long*>(dest)) = r;
    869   return true;
    870 #endif   /* HAVE_UNSIGNED_LONG_LONG */
    871 }
    872 
    873 bool Arg::parse_double(const char* str, int n, void* dest) {
    874   if (n == 0) return false;
    875   static const int kMaxLength = 200;
    876   char buf[kMaxLength];
    877   if (n >= kMaxLength) return false;
    878   memcpy(buf, str, n);
    879   buf[n] = '\0';
    880   errno = 0;
    881   char* end;
    882   double r = strtod(buf, &end);
    883   if (end != buf + n) return false;   // Leftover junk
    884   if (errno) return false;
    885   if (dest == NULL) return true;
    886   *(reinterpret_cast<double*>(dest)) = r;
    887   return true;
    888 }
    889 
    890 bool Arg::parse_float(const char* str, int n, void* dest) {
    891   double r;
    892   if (!parse_double(str, n, &r)) return false;
    893   if (dest == NULL) return true;
    894   *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
    895   return true;
    896 }
    897 
    898 
    899 #define DEFINE_INTEGER_PARSERS(name)                                    \
    900   bool Arg::parse_##name(const char* str, int n, void* dest) {          \
    901     return parse_##name##_radix(str, n, dest, 10);                      \
    902   }                                                                     \
    903   bool Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
    904     return parse_##name##_radix(str, n, dest, 16);                      \
    905   }                                                                     \
    906   bool Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
    907     return parse_##name##_radix(str, n, dest, 8);                       \
    908   }                                                                     \
    909   bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
    910     return parse_##name##_radix(str, n, dest, 0);                       \
    911   }
    912 
    913 DEFINE_INTEGER_PARSERS(short)      /*                                   */
    914 DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
    915 DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
    916 DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
    917 DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
    918 DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
    919 DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
    920 DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
    921 
    922 #undef DEFINE_INTEGER_PARSERS
    923 
    924 }   // namespace pcrecpp
    925