Home | History | Annotate | Download | only in dist
      1 // Copyright (c) 2005, Google Inc.
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are
      6 // met:
      7 //
      8 //     * Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 //     * Redistributions in binary form must reproduce the above
     11 // copyright notice, this list of conditions and the following disclaimer
     12 // in the documentation and/or other materials provided with the
     13 // distribution.
     14 //     * Neither the name of Google Inc. nor the names of its
     15 // contributors may be used to endorse or promote products derived from
     16 // this software without specific prior written permission.
     17 //
     18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 //
     30 // Author: Sanjay Ghemawat
     31 // Support for PCRE_XXX modifiers added by Giuseppe Maxia, July 2005
     32 
     33 #ifndef _PCRECPP_H
     34 #define _PCRECPP_H
     35 
     36 // C++ interface to the pcre regular-expression library.  RE supports
     37 // Perl-style regular expressions (with extensions like \d, \w, \s,
     38 // ...).
     39 //
     40 // -----------------------------------------------------------------------
     41 // REGEXP SYNTAX:
     42 //
     43 // This module is part of the pcre library and hence supports its syntax
     44 // for regular expressions.
     45 //
     46 // The syntax is pretty similar to Perl's.  For those not familiar
     47 // with Perl's regular expressions, here are some examples of the most
     48 // commonly used extensions:
     49 //
     50 //   "hello (\\w+) world"  -- \w matches a "word" character
     51 //   "version (\\d+)"      -- \d matches a digit
     52 //   "hello\\s+world"      -- \s matches any whitespace character
     53 //   "\\b(\\w+)\\b"        -- \b matches empty string at a word boundary
     54 //   "(?i)hello"           -- (?i) turns on case-insensitive matching
     55 //   "/\\*(.*?)\\*/"       -- .*? matches . minimum no. of times possible
     56 //
     57 // -----------------------------------------------------------------------
     58 // MATCHING INTERFACE:
     59 //
     60 // The "FullMatch" operation checks that supplied text matches a
     61 // supplied pattern exactly.
     62 //
     63 // Example: successful match
     64 //    pcrecpp::RE re("h.*o");
     65 //    re.FullMatch("hello");
     66 //
     67 // Example: unsuccessful match (requires full match):
     68 //    pcrecpp::RE re("e");
     69 //    !re.FullMatch("hello");
     70 //
     71 // Example: creating a temporary RE object:
     72 //    pcrecpp::RE("h.*o").FullMatch("hello");
     73 //
     74 // You can pass in a "const char*" or a "string" for "text".  The
     75 // examples below tend to use a const char*.
     76 //
     77 // You can, as in the different examples above, store the RE object
     78 // explicitly in a variable or use a temporary RE object.  The
     79 // examples below use one mode or the other arbitrarily.  Either
     80 // could correctly be used for any of these examples.
     81 //
     82 // -----------------------------------------------------------------------
     83 // MATCHING WITH SUB-STRING EXTRACTION:
     84 //
     85 // You can supply extra pointer arguments to extract matched subpieces.
     86 //
     87 // Example: extracts "ruby" into "s" and 1234 into "i"
     88 //    int i;
     89 //    string s;
     90 //    pcrecpp::RE re("(\\w+):(\\d+)");
     91 //    re.FullMatch("ruby:1234", &s, &i);
     92 //
     93 // Example: does not try to extract any extra sub-patterns
     94 //    re.FullMatch("ruby:1234", &s);
     95 //
     96 // Example: does not try to extract into NULL
     97 //    re.FullMatch("ruby:1234", NULL, &i);
     98 //
     99 // Example: integer overflow causes failure
    100 //    !re.FullMatch("ruby:1234567891234", NULL, &i);
    101 //
    102 // Example: fails because there aren't enough sub-patterns:
    103 //    !pcrecpp::RE("\\w+:\\d+").FullMatch("ruby:1234", &s);
    104 //
    105 // Example: fails because string cannot be stored in integer
    106 //    !pcrecpp::RE("(.*)").FullMatch("ruby", &i);
    107 //
    108 // The provided pointer arguments can be pointers to any scalar numeric
    109 // type, or one of
    110 //    string        (matched piece is copied to string)
    111 //    StringPiece   (StringPiece is mutated to point to matched piece)
    112 //    T             (where "bool T::ParseFrom(const char*, int)" exists)
    113 //    NULL          (the corresponding matched sub-pattern is not copied)
    114 //
    115 // CAVEAT: An optional sub-pattern that does not exist in the matched
    116 // string is assigned the empty string.  Therefore, the following will
    117 // return false (because the empty string is not a valid number):
    118 //    int number;
    119 //    pcrecpp::RE("[a-z]+(\\d+)?").FullMatch("abc", &number);
    120 //
    121 // -----------------------------------------------------------------------
    122 // DO_MATCH
    123 //
    124 // The matching interface supports at most 16 arguments per call.
    125 // If you need more, consider using the more general interface
    126 // pcrecpp::RE::DoMatch().  See pcrecpp.h for the signature for DoMatch.
    127 //
    128 // -----------------------------------------------------------------------
    129 // PARTIAL MATCHES
    130 //
    131 // You can use the "PartialMatch" operation when you want the pattern
    132 // to match any substring of the text.
    133 //
    134 // Example: simple search for a string:
    135 //    pcrecpp::RE("ell").PartialMatch("hello");
    136 //
    137 // Example: find first number in a string:
    138 //    int number;
    139 //    pcrecpp::RE re("(\\d+)");
    140 //    re.PartialMatch("x*100 + 20", &number);
    141 //    assert(number == 100);
    142 //
    143 // -----------------------------------------------------------------------
    144 // UTF-8 AND THE MATCHING INTERFACE:
    145 //
    146 // By default, pattern and text are plain text, one byte per character.
    147 // The UTF8 flag, passed to the constructor, causes both pattern
    148 // and string to be treated as UTF-8 text, still a byte stream but
    149 // potentially multiple bytes per character. In practice, the text
    150 // is likelier to be UTF-8 than the pattern, but the match returned
    151 // may depend on the UTF8 flag, so always use it when matching
    152 // UTF8 text.  E.g., "." will match one byte normally but with UTF8
    153 // set may match up to three bytes of a multi-byte character.
    154 //
    155 // Example:
    156 //    pcrecpp::RE_Options options;
    157 //    options.set_utf8(true);
    158 //    pcrecpp::RE re(utf8_pattern, options);
    159 //    re.FullMatch(utf8_string);
    160 //
    161 // Example: using the convenience function UTF8():
    162 //    pcrecpp::RE re(utf8_pattern, pcrecpp::UTF8());
    163 //    re.FullMatch(utf8_string);
    164 //
    165 // NOTE: The UTF8 option is ignored if pcre was not configured with the
    166 //       --enable-utf8 flag.
    167 //
    168 // -----------------------------------------------------------------------
    169 // PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE
    170 //
    171 // PCRE defines some modifiers to change the behavior of the regular
    172 // expression engine.
    173 // The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle
    174 // to pass such modifiers to a RE class.
    175 //
    176 // Currently, the following modifiers are supported
    177 //
    178 //    modifier              description               Perl corresponding
    179 //
    180 //    PCRE_CASELESS         case insensitive match    /i
    181 //    PCRE_MULTILINE        multiple lines match      /m
    182 //    PCRE_DOTALL           dot matches newlines      /s
    183 //    PCRE_DOLLAR_ENDONLY   $ matches only at end     N/A
    184 //    PCRE_EXTRA            strict escape parsing     N/A
    185 //    PCRE_EXTENDED         ignore whitespaces        /x
    186 //    PCRE_UTF8             handles UTF8 chars        built-in
    187 //    PCRE_UNGREEDY         reverses * and *?         N/A
    188 //    PCRE_NO_AUTO_CAPTURE  disables matching parens  N/A (*)
    189 //
    190 // (For a full account on how each modifier works, please check the
    191 // PCRE API reference manual).
    192 //
    193 // (*) Both Perl and PCRE allow non matching parentheses by means of the
    194 // "?:" modifier within the pattern itself. e.g. (?:ab|cd) does not
    195 // capture, while (ab|cd) does.
    196 //
    197 // For each modifier, there are two member functions whose name is made
    198 // out of the modifier in lowercase, without the "PCRE_" prefix. For
    199 // instance, PCRE_CASELESS is handled by
    200 //    bool caseless(),
    201 // which returns true if the modifier is set, and
    202 //    RE_Options & set_caseless(bool),
    203 // which sets or unsets the modifier.
    204 //
    205 // Moreover, PCRE_EXTRA_MATCH_LIMIT can be accessed through the
    206 // set_match_limit() and match_limit() member functions.
    207 // Setting match_limit to a non-zero value will limit the execution of
    208 // pcre to keep it from doing bad things like blowing the stack or taking
    209 // an eternity to return a result.  A value of 5000 is good enough to stop
    210 // stack blowup in a 2MB thread stack.  Setting match_limit to zero will
    211 // disable match limiting.  Alternately, you can set match_limit_recursion()
    212 // which uses PCRE_EXTRA_MATCH_LIMIT_RECURSION to limit how much pcre
    213 // recurses.  match_limit() caps the number of matches pcre does;
    214 // match_limit_recursion() caps the depth of recursion.
    215 //
    216 // Normally, to pass one or more modifiers to a RE class, you declare
    217 // a RE_Options object, set the appropriate options, and pass this
    218 // object to a RE constructor. Example:
    219 //
    220 //    RE_Options opt;
    221 //    opt.set_caseless(true);
    222 //
    223 //    if (RE("HELLO", opt).PartialMatch("hello world")) ...
    224 //
    225 // RE_Options has two constructors. The default constructor takes no
    226 // arguments and creates a set of flags that are off by default.
    227 //
    228 // The optional parameter 'option_flags' is to facilitate transfer
    229 // of legacy code from C programs.  This lets you do
    230 //    RE(pattern, RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str);
    231 //
    232 // But new code is better off doing
    233 //    RE(pattern,
    234 //      RE_Options().set_caseless(true).set_multiline(true)).PartialMatch(str);
    235 // (See below)
    236 //
    237 // If you are going to pass one of the most used modifiers, there are some
    238 // convenience functions that return a RE_Options class with the
    239 // appropriate modifier already set:
    240 // CASELESS(), UTF8(), MULTILINE(), DOTALL(), EXTENDED()
    241 //
    242 // If you need to set several options at once, and you don't want to go
    243 // through the pains of declaring a RE_Options object and setting several
    244 // options, there is a parallel method that gives you such ability on the
    245 // fly. You can concatenate several set_xxxxx member functions, since each
    246 // of them returns a reference to its class object.  e.g.: to pass
    247 // PCRE_CASELESS, PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one
    248 // statement, you may write
    249 //
    250 //    RE(" ^ xyz \\s+ .* blah$", RE_Options()
    251 //                            .set_caseless(true)
    252 //                            .set_extended(true)
    253 //                            .set_multiline(true)).PartialMatch(sometext);
    254 //
    255 // -----------------------------------------------------------------------
    256 // SCANNING TEXT INCREMENTALLY
    257 //
    258 // The "Consume" operation may be useful if you want to repeatedly
    259 // match regular expressions at the front of a string and skip over
    260 // them as they match.  This requires use of the "StringPiece" type,
    261 // which represents a sub-range of a real string.  Like RE, StringPiece
    262 // is defined in the pcrecpp namespace.
    263 //
    264 // Example: read lines of the form "var = value" from a string.
    265 //    string contents = ...;                 // Fill string somehow
    266 //    pcrecpp::StringPiece input(contents);  // Wrap in a StringPiece
    267 //
    268 //    string var;
    269 //    int value;
    270 //    pcrecpp::RE re("(\\w+) = (\\d+)\n");
    271 //    while (re.Consume(&input, &var, &value)) {
    272 //      ...;
    273 //    }
    274 //
    275 // Each successful call to "Consume" will set "var/value", and also
    276 // advance "input" so it points past the matched text.
    277 //
    278 // The "FindAndConsume" operation is similar to "Consume" but does not
    279 // anchor your match at the beginning of the string.  For example, you
    280 // could extract all words from a string by repeatedly calling
    281 //     pcrecpp::RE("(\\w+)").FindAndConsume(&input, &word)
    282 //
    283 // -----------------------------------------------------------------------
    284 // PARSING HEX/OCTAL/C-RADIX NUMBERS
    285 //
    286 // By default, if you pass a pointer to a numeric value, the
    287 // corresponding text is interpreted as a base-10 number.  You can
    288 // instead wrap the pointer with a call to one of the operators Hex(),
    289 // Octal(), or CRadix() to interpret the text in another base.  The
    290 // CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
    291 // prefixes, but defaults to base-10.
    292 //
    293 // Example:
    294 //   int a, b, c, d;
    295 //   pcrecpp::RE re("(.*) (.*) (.*) (.*)");
    296 //   re.FullMatch("100 40 0100 0x40",
    297 //                pcrecpp::Octal(&a), pcrecpp::Hex(&b),
    298 //                pcrecpp::CRadix(&c), pcrecpp::CRadix(&d));
    299 // will leave 64 in a, b, c, and d.
    300 //
    301 // -----------------------------------------------------------------------
    302 // REPLACING PARTS OF STRINGS
    303 //
    304 // You can replace the first match of "pattern" in "str" with
    305 // "rewrite".  Within "rewrite", backslash-escaped digits (\1 to \9)
    306 // can be used to insert text matching corresponding parenthesized
    307 // group from the pattern.  \0 in "rewrite" refers to the entire
    308 // matching text.  E.g.,
    309 //
    310 //   string s = "yabba dabba doo";
    311 //   pcrecpp::RE("b+").Replace("d", &s);
    312 //
    313 // will leave "s" containing "yada dabba doo".  The result is true if
    314 // the pattern matches and a replacement occurs, or false otherwise.
    315 //
    316 // GlobalReplace() is like Replace(), except that it replaces all
    317 // occurrences of the pattern in the string with the rewrite.
    318 // Replacements are not subject to re-matching.  E.g.,
    319 //
    320 //   string s = "yabba dabba doo";
    321 //   pcrecpp::RE("b+").GlobalReplace("d", &s);
    322 //
    323 // will leave "s" containing "yada dada doo".  It returns the number
    324 // of replacements made.
    325 //
    326 // Extract() is like Replace(), except that if the pattern matches,
    327 // "rewrite" is copied into "out" (an additional argument) with
    328 // substitutions.  The non-matching portions of "text" are ignored.
    329 // Returns true iff a match occurred and the extraction happened
    330 // successfully.  If no match occurs, the string is left unaffected.
    331 
    332 
    333 #include <string>
    334 #include <pcre.h>
    335 #include <pcrecpparg.h>   // defines the Arg class
    336 // This isn't technically needed here, but we include it
    337 // anyway so folks who include pcrecpp.h don't have to.
    338 #include <pcre_stringpiece.h>
    339 
    340 namespace pcrecpp {
    341 
    342 #define PCRE_SET_OR_CLEAR(b, o) \
    343     if (b) all_options_ |= (o); else all_options_ &= ~(o); \
    344     return *this
    345 
    346 #define PCRE_IS_SET(o)  \
    347         (all_options_ & o) == o
    348 
    349 /***** Compiling regular expressions: the RE class *****/
    350 
    351 // RE_Options allow you to set options to be passed along to pcre,
    352 // along with other options we put on top of pcre.
    353 // Only 9 modifiers, plus match_limit and match_limit_recursion,
    354 // are supported now.
    355 class PCRECPP_EXP_DEFN RE_Options {
    356  public:
    357   // constructor
    358   RE_Options() : match_limit_(0), match_limit_recursion_(0), all_options_(0) {}
    359 
    360   // alternative constructor.
    361   // To facilitate transfer of legacy code from C programs
    362   //
    363   // This lets you do
    364   //    RE(pattern, RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str);
    365   // But new code is better off doing
    366   //    RE(pattern,
    367   //      RE_Options().set_caseless(true).set_multiline(true)).PartialMatch(str);
    368   RE_Options(int option_flags) : match_limit_(0), match_limit_recursion_(0),
    369                                  all_options_(option_flags) {}
    370   // we're fine with the default destructor, copy constructor, etc.
    371 
    372   // accessors and mutators
    373   int match_limit() const { return match_limit_; };
    374   RE_Options &set_match_limit(int limit) {
    375     match_limit_ = limit;
    376     return *this;
    377   }
    378 
    379   int match_limit_recursion() const { return match_limit_recursion_; };
    380   RE_Options &set_match_limit_recursion(int limit) {
    381     match_limit_recursion_ = limit;
    382     return *this;
    383   }
    384 
    385   bool caseless() const {
    386     return PCRE_IS_SET(PCRE_CASELESS);
    387   }
    388   RE_Options &set_caseless(bool x) {
    389     PCRE_SET_OR_CLEAR(x, PCRE_CASELESS);
    390   }
    391 
    392   bool multiline() const {
    393     return PCRE_IS_SET(PCRE_MULTILINE);
    394   }
    395   RE_Options &set_multiline(bool x) {
    396     PCRE_SET_OR_CLEAR(x, PCRE_MULTILINE);
    397   }
    398 
    399   bool dotall() const {
    400     return PCRE_IS_SET(PCRE_DOTALL);
    401   }
    402   RE_Options &set_dotall(bool x) {
    403     PCRE_SET_OR_CLEAR(x, PCRE_DOTALL);
    404   }
    405 
    406   bool extended() const {
    407     return PCRE_IS_SET(PCRE_EXTENDED);
    408   }
    409   RE_Options &set_extended(bool x) {
    410     PCRE_SET_OR_CLEAR(x, PCRE_EXTENDED);
    411   }
    412 
    413   bool dollar_endonly() const {
    414     return PCRE_IS_SET(PCRE_DOLLAR_ENDONLY);
    415   }
    416   RE_Options &set_dollar_endonly(bool x) {
    417     PCRE_SET_OR_CLEAR(x, PCRE_DOLLAR_ENDONLY);
    418   }
    419 
    420   bool extra() const {
    421     return PCRE_IS_SET(PCRE_EXTRA);
    422   }
    423   RE_Options &set_extra(bool x) {
    424     PCRE_SET_OR_CLEAR(x, PCRE_EXTRA);
    425   }
    426 
    427   bool ungreedy() const {
    428     return PCRE_IS_SET(PCRE_UNGREEDY);
    429   }
    430   RE_Options &set_ungreedy(bool x) {
    431     PCRE_SET_OR_CLEAR(x, PCRE_UNGREEDY);
    432   }
    433 
    434   bool utf8() const {
    435     return PCRE_IS_SET(PCRE_UTF8);
    436   }
    437   RE_Options &set_utf8(bool x) {
    438     PCRE_SET_OR_CLEAR(x, PCRE_UTF8);
    439   }
    440 
    441   bool no_auto_capture() const {
    442     return PCRE_IS_SET(PCRE_NO_AUTO_CAPTURE);
    443   }
    444   RE_Options &set_no_auto_capture(bool x) {
    445     PCRE_SET_OR_CLEAR(x, PCRE_NO_AUTO_CAPTURE);
    446   }
    447 
    448   RE_Options &set_all_options(int opt) {
    449     all_options_ = opt;
    450     return *this;
    451   }
    452   int all_options() const {
    453     return all_options_ ;
    454   }
    455 
    456   // TODO: add other pcre flags
    457 
    458  private:
    459   int match_limit_;
    460   int match_limit_recursion_;
    461   int all_options_;
    462 };
    463 
    464 // These functions return some common RE_Options
    465 static inline RE_Options UTF8() {
    466   return RE_Options().set_utf8(true);
    467 }
    468 
    469 static inline RE_Options CASELESS() {
    470   return RE_Options().set_caseless(true);
    471 }
    472 static inline RE_Options MULTILINE() {
    473   return RE_Options().set_multiline(true);
    474 }
    475 
    476 static inline RE_Options DOTALL() {
    477   return RE_Options().set_dotall(true);
    478 }
    479 
    480 static inline RE_Options EXTENDED() {
    481   return RE_Options().set_extended(true);
    482 }
    483 
    484 // Interface for regular expression matching.  Also corresponds to a
    485 // pre-compiled regular expression.  An "RE" object is safe for
    486 // concurrent use by multiple threads.
    487 class PCRECPP_EXP_DEFN RE {
    488  public:
    489   // We provide implicit conversions from strings so that users can
    490   // pass in a string or a "const char*" wherever an "RE" is expected.
    491   RE(const string& pat) { Init(pat, NULL); }
    492   RE(const string& pat, const RE_Options& option) { Init(pat, &option); }
    493   RE(const char* pat) { Init(pat, NULL); }
    494   RE(const char* pat, const RE_Options& option) { Init(pat, &option); }
    495   RE(const unsigned char* pat) {
    496     Init(reinterpret_cast<const char*>(pat), NULL);
    497   }
    498   RE(const unsigned char* pat, const RE_Options& option) {
    499     Init(reinterpret_cast<const char*>(pat), &option);
    500   }
    501 
    502   // Copy constructor & assignment - note that these are expensive
    503   // because they recompile the expression.
    504   RE(const RE& re) { Init(re.pattern_, &re.options_); }
    505   const RE& operator=(const RE& re) {
    506     if (this != &re) {
    507       Cleanup();
    508 
    509       // This is the code that originally came from Google
    510       // Init(re.pattern_.c_str(), &re.options_);
    511 
    512       // This is the replacement from Ari Pollak
    513       Init(re.pattern_, &re.options_);
    514     }
    515     return *this;
    516   }
    517 
    518 
    519   ~RE();
    520 
    521   // The string specification for this RE.  E.g.
    522   //   RE re("ab*c?d+");
    523   //   re.pattern();    // "ab*c?d+"
    524   const string& pattern() const { return pattern_; }
    525 
    526   // If RE could not be created properly, returns an error string.
    527   // Else returns the empty string.
    528   const string& error() const { return *error_; }
    529 
    530   /***** The useful part: the matching interface *****/
    531 
    532   // This is provided so one can do pattern.ReplaceAll() just as
    533   // easily as ReplaceAll(pattern-text, ....)
    534 
    535   bool FullMatch(const StringPiece& text,
    536                  const Arg& ptr1 = no_arg,
    537                  const Arg& ptr2 = no_arg,
    538                  const Arg& ptr3 = no_arg,
    539                  const Arg& ptr4 = no_arg,
    540                  const Arg& ptr5 = no_arg,
    541                  const Arg& ptr6 = no_arg,
    542                  const Arg& ptr7 = no_arg,
    543                  const Arg& ptr8 = no_arg,
    544                  const Arg& ptr9 = no_arg,
    545                  const Arg& ptr10 = no_arg,
    546                  const Arg& ptr11 = no_arg,
    547                  const Arg& ptr12 = no_arg,
    548                  const Arg& ptr13 = no_arg,
    549                  const Arg& ptr14 = no_arg,
    550                  const Arg& ptr15 = no_arg,
    551                  const Arg& ptr16 = no_arg) const;
    552 
    553   bool PartialMatch(const StringPiece& text,
    554                     const Arg& ptr1 = no_arg,
    555                     const Arg& ptr2 = no_arg,
    556                     const Arg& ptr3 = no_arg,
    557                     const Arg& ptr4 = no_arg,
    558                     const Arg& ptr5 = no_arg,
    559                     const Arg& ptr6 = no_arg,
    560                     const Arg& ptr7 = no_arg,
    561                     const Arg& ptr8 = no_arg,
    562                     const Arg& ptr9 = no_arg,
    563                     const Arg& ptr10 = no_arg,
    564                     const Arg& ptr11 = no_arg,
    565                     const Arg& ptr12 = no_arg,
    566                     const Arg& ptr13 = no_arg,
    567                     const Arg& ptr14 = no_arg,
    568                     const Arg& ptr15 = no_arg,
    569                     const Arg& ptr16 = no_arg) const;
    570 
    571   bool Consume(StringPiece* input,
    572                const Arg& ptr1 = no_arg,
    573                const Arg& ptr2 = no_arg,
    574                const Arg& ptr3 = no_arg,
    575                const Arg& ptr4 = no_arg,
    576                const Arg& ptr5 = no_arg,
    577                const Arg& ptr6 = no_arg,
    578                const Arg& ptr7 = no_arg,
    579                const Arg& ptr8 = no_arg,
    580                const Arg& ptr9 = no_arg,
    581                const Arg& ptr10 = no_arg,
    582                const Arg& ptr11 = no_arg,
    583                const Arg& ptr12 = no_arg,
    584                const Arg& ptr13 = no_arg,
    585                const Arg& ptr14 = no_arg,
    586                const Arg& ptr15 = no_arg,
    587                const Arg& ptr16 = no_arg) const;
    588 
    589   bool FindAndConsume(StringPiece* input,
    590                       const Arg& ptr1 = no_arg,
    591                       const Arg& ptr2 = no_arg,
    592                       const Arg& ptr3 = no_arg,
    593                       const Arg& ptr4 = no_arg,
    594                       const Arg& ptr5 = no_arg,
    595                       const Arg& ptr6 = no_arg,
    596                       const Arg& ptr7 = no_arg,
    597                       const Arg& ptr8 = no_arg,
    598                       const Arg& ptr9 = no_arg,
    599                       const Arg& ptr10 = no_arg,
    600                       const Arg& ptr11 = no_arg,
    601                       const Arg& ptr12 = no_arg,
    602                       const Arg& ptr13 = no_arg,
    603                       const Arg& ptr14 = no_arg,
    604                       const Arg& ptr15 = no_arg,
    605                       const Arg& ptr16 = no_arg) const;
    606 
    607   bool Replace(const StringPiece& rewrite,
    608                string *str) const;
    609 
    610   int GlobalReplace(const StringPiece& rewrite,
    611                     string *str) const;
    612 
    613   bool Extract(const StringPiece &rewrite,
    614                const StringPiece &text,
    615                string *out) const;
    616 
    617   // Escapes all potentially meaningful regexp characters in
    618   // 'unquoted'.  The returned string, used as a regular expression,
    619   // will exactly match the original string.  For example,
    620   //           1.5-2.0?
    621   // may become:
    622   //           1\.5\-2\.0\?
    623   // Note QuoteMeta behaves the same as perl's QuoteMeta function,
    624   // *except* that it escapes the NUL character (\0) as backslash + 0,
    625   // rather than backslash + NUL.
    626   static string QuoteMeta(const StringPiece& unquoted);
    627 
    628 
    629   /***** Generic matching interface *****/
    630 
    631   // Type of match (TODO: Should be restructured as part of RE_Options)
    632   enum Anchor {
    633     UNANCHORED,         // No anchoring
    634     ANCHOR_START,       // Anchor at start only
    635     ANCHOR_BOTH         // Anchor at start and end
    636   };
    637 
    638   // General matching routine.  Stores the length of the match in
    639   // "*consumed" if successful.
    640   bool DoMatch(const StringPiece& text,
    641                Anchor anchor,
    642                int* consumed,
    643                const Arg* const* args, int n) const;
    644 
    645   // Return the number of capturing subpatterns, or -1 if the
    646   // regexp wasn't valid on construction.
    647   int NumberOfCapturingGroups() const;
    648 
    649   // The default value for an argument, to indicate the end of the argument
    650   // list. This must be used only in optional argument defaults. It should NOT
    651   // be passed explicitly. Some people have tried to use it like this:
    652   //
    653   //   FullMatch(x, y, &z, no_arg, &w);
    654   //
    655   // This is a mistake, and will not work.
    656   static Arg no_arg;
    657 
    658  private:
    659 
    660   void Init(const string& pattern, const RE_Options* options);
    661   void Cleanup();
    662 
    663   // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with
    664   // pairs of integers for the beginning and end positions of matched
    665   // text.  The first pair corresponds to the entire matched text;
    666   // subsequent pairs correspond, in order, to parentheses-captured
    667   // matches.  Returns the number of pairs (one more than the number of
    668   // the last subpattern with a match) if matching was successful
    669   // and zero if the match failed.
    670   // I.e. for RE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching
    671   // against "foo", "bar", and "baz" respectively.
    672   // When matching RE("(foo)|hello") against "hello", it will return 1.
    673   // But the values for all subpattern are filled in into "vec".
    674   int TryMatch(const StringPiece& text,
    675                int startpos,
    676                Anchor anchor,
    677                bool empty_ok,
    678                int *vec,
    679                int vecsize) const;
    680 
    681   // Append the "rewrite" string, with backslash subsitutions from "text"
    682   // and "vec", to string "out".
    683   bool Rewrite(string *out,
    684                const StringPiece& rewrite,
    685                const StringPiece& text,
    686                int *vec,
    687                int veclen) const;
    688 
    689   // internal implementation for DoMatch
    690   bool DoMatchImpl(const StringPiece& text,
    691                    Anchor anchor,
    692                    int* consumed,
    693                    const Arg* const args[],
    694                    int n,
    695                    int* vec,
    696                    int vecsize) const;
    697 
    698   // Compile the regexp for the specified anchoring mode
    699   pcre* Compile(Anchor anchor);
    700 
    701   string        pattern_;
    702   RE_Options    options_;
    703   pcre*         re_full_;       // For full matches
    704   pcre*         re_partial_;    // For partial matches
    705   const string* error_;         // Error indicator (or points to empty string)
    706 };
    707 
    708 }   // namespace pcrecpp
    709 
    710 #endif /* _PCRECPP_H */
    711