// Copyright (c) 2010, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 // 30 // Author: Sanjay Ghemawat 31 32 #ifdef HAVE_CONFIG_H 33 #include "config.h" 34 #endif 35 36 #include <stdlib.h> 37 #include <stdio.h> 38 #include <ctype.h> 39 #include <limits.h> /* for SHRT_MIN, USHRT_MAX, etc */ 40 #include <assert.h> 41 #include <errno.h> 42 #include <string> 43 #include <algorithm> 44 45 #include "pcrecpp_internal.h" 46 #include "pcre.h" 47 #include "pcrecpp.h" 48 #include "pcre_stringpiece.h" 49 50 51 namespace pcrecpp { 52 53 // Maximum number of args we can set 54 static const int kMaxArgs = 16; 55 static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace 56 57 // Special object that stands-in for no argument 58 Arg RE::no_arg((void*)NULL); 59 60 // This is for ABI compatibility with old versions of pcre (pre-7.6), 61 // which defined a global no_arg variable instead of putting it in the 62 // RE class. This works on GCC >= 3, at least. It definitely works 63 // for ELF, but may not for other object formats (Mach-O, for 64 // instance, does not support aliases.) We could probably have a more 65 // inclusive test if we ever needed it. (Note that not only the 66 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are 67 // gnu-specific.) 
68 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) 69 # define ULP_AS_STRING(x) ULP_AS_STRING_INTERNAL(x) 70 # define ULP_AS_STRING_INTERNAL(x) #x 71 # define USER_LABEL_PREFIX_STR ULP_AS_STRING(__USER_LABEL_PREFIX__) 72 extern Arg no_arg 73 __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE"))); 74 #endif 75 76 // If a regular expression has no error, its error_ field points here 77 static const string empty_string; 78 79 // If the user doesn't ask for any options, we just use this one 80 static RE_Options default_options; 81 82 void RE::Init(const string& pat, const RE_Options* options) { 83 pattern_ = pat; 84 if (options == NULL) { 85 options_ = default_options; 86 } else { 87 options_ = *options; 88 } 89 error_ = &empty_string; 90 re_full_ = NULL; 91 re_partial_ = NULL; 92 93 re_partial_ = Compile(UNANCHORED); 94 if (re_partial_ != NULL) { 95 re_full_ = Compile(ANCHOR_BOTH); 96 } 97 } 98 99 void RE::Cleanup() { 100 if (re_full_ != NULL) (*pcre_free)(re_full_); 101 if (re_partial_ != NULL) (*pcre_free)(re_partial_); 102 if (error_ != &empty_string) delete error_; 103 } 104 105 106 RE::~RE() { 107 Cleanup(); 108 } 109 110 111 pcre* RE::Compile(Anchor anchor) { 112 // First, convert RE_Options into pcre options 113 int pcre_options = 0; 114 pcre_options = options_.all_options(); 115 116 // Special treatment for anchoring. This is needed because at 117 // runtime pcre only provides an option for anchoring at the 118 // beginning of a string (unless you use offset). 119 // 120 // There are three types of anchoring we want: 121 // UNANCHORED Compile the original pattern, and use 122 // a pcre unanchored match. 123 // ANCHOR_START Compile the original pattern, and use 124 // a pcre anchored match. 125 // ANCHOR_BOTH Tack a "\z" to the end of the original pattern 126 // and use a pcre anchored match. 
127 128 const char* compile_error; 129 int eoffset; 130 pcre* re; 131 if (anchor != ANCHOR_BOTH) { 132 re = pcre_compile(pattern_.c_str(), pcre_options, 133 &compile_error, &eoffset, NULL); 134 } else { 135 // Tack a '\z' at the end of RE. Parenthesize it first so that 136 // the '\z' applies to all top-level alternatives in the regexp. 137 string wrapped = "(?:"; // A non-counting grouping operator 138 wrapped += pattern_; 139 wrapped += ")\\z"; 140 re = pcre_compile(wrapped.c_str(), pcre_options, 141 &compile_error, &eoffset, NULL); 142 } 143 if (re == NULL) { 144 if (error_ == &empty_string) error_ = new string(compile_error); 145 } 146 return re; 147 } 148 149 /***** Matching interfaces *****/ 150 151 bool RE::FullMatch(const StringPiece& text, 152 const Arg& ptr1, 153 const Arg& ptr2, 154 const Arg& ptr3, 155 const Arg& ptr4, 156 const Arg& ptr5, 157 const Arg& ptr6, 158 const Arg& ptr7, 159 const Arg& ptr8, 160 const Arg& ptr9, 161 const Arg& ptr10, 162 const Arg& ptr11, 163 const Arg& ptr12, 164 const Arg& ptr13, 165 const Arg& ptr14, 166 const Arg& ptr15, 167 const Arg& ptr16) const { 168 const Arg* args[kMaxArgs]; 169 int n = 0; 170 if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1; 171 if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2; 172 if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3; 173 if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4; 174 if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5; 175 if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6; 176 if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7; 177 if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8; 178 if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9; 179 if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10; 180 if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11; 181 if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12; 182 if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13; 183 if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14; 184 if (&ptr15 == &no_arg) goto done; args[n++] = 
&ptr15; 185 if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16; 186 done: 187 188 int consumed; 189 int vec[kVecSize]; 190 return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); 191 } 192 193 bool RE::PartialMatch(const StringPiece& text, 194 const Arg& ptr1, 195 const Arg& ptr2, 196 const Arg& ptr3, 197 const Arg& ptr4, 198 const Arg& ptr5, 199 const Arg& ptr6, 200 const Arg& ptr7, 201 const Arg& ptr8, 202 const Arg& ptr9, 203 const Arg& ptr10, 204 const Arg& ptr11, 205 const Arg& ptr12, 206 const Arg& ptr13, 207 const Arg& ptr14, 208 const Arg& ptr15, 209 const Arg& ptr16) const { 210 const Arg* args[kMaxArgs]; 211 int n = 0; 212 if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1; 213 if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2; 214 if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3; 215 if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4; 216 if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5; 217 if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6; 218 if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7; 219 if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8; 220 if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9; 221 if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10; 222 if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11; 223 if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12; 224 if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13; 225 if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14; 226 if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15; 227 if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16; 228 done: 229 230 int consumed; 231 int vec[kVecSize]; 232 return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); 233 } 234 235 bool RE::Consume(StringPiece* input, 236 const Arg& ptr1, 237 const Arg& ptr2, 238 const Arg& ptr3, 239 const Arg& ptr4, 240 const Arg& ptr5, 241 const Arg& ptr6, 242 const Arg& ptr7, 243 const Arg& ptr8, 244 const Arg& ptr9, 245 const Arg& ptr10, 246 const Arg& ptr11, 247 const Arg& 
ptr12, 248 const Arg& ptr13, 249 const Arg& ptr14, 250 const Arg& ptr15, 251 const Arg& ptr16) const { 252 const Arg* args[kMaxArgs]; 253 int n = 0; 254 if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1; 255 if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2; 256 if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3; 257 if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4; 258 if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5; 259 if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6; 260 if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7; 261 if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8; 262 if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9; 263 if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10; 264 if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11; 265 if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12; 266 if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13; 267 if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14; 268 if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15; 269 if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16; 270 done: 271 272 int consumed; 273 int vec[kVecSize]; 274 if (DoMatchImpl(*input, ANCHOR_START, &consumed, 275 args, n, vec, kVecSize)) { 276 input->remove_prefix(consumed); 277 return true; 278 } else { 279 return false; 280 } 281 } 282 283 bool RE::FindAndConsume(StringPiece* input, 284 const Arg& ptr1, 285 const Arg& ptr2, 286 const Arg& ptr3, 287 const Arg& ptr4, 288 const Arg& ptr5, 289 const Arg& ptr6, 290 const Arg& ptr7, 291 const Arg& ptr8, 292 const Arg& ptr9, 293 const Arg& ptr10, 294 const Arg& ptr11, 295 const Arg& ptr12, 296 const Arg& ptr13, 297 const Arg& ptr14, 298 const Arg& ptr15, 299 const Arg& ptr16) const { 300 const Arg* args[kMaxArgs]; 301 int n = 0; 302 if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1; 303 if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2; 304 if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3; 305 if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4; 306 if (&ptr5 == &no_arg) goto 
done; args[n++] = &ptr5; 307 if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6; 308 if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7; 309 if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8; 310 if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9; 311 if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10; 312 if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11; 313 if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12; 314 if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13; 315 if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14; 316 if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15; 317 if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16; 318 done: 319 320 int consumed; 321 int vec[kVecSize]; 322 if (DoMatchImpl(*input, UNANCHORED, &consumed, 323 args, n, vec, kVecSize)) { 324 input->remove_prefix(consumed); 325 return true; 326 } else { 327 return false; 328 } 329 } 330 331 bool RE::Replace(const StringPiece& rewrite, 332 string *str) const { 333 int vec[kVecSize]; 334 int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); 335 if (matches == 0) 336 return false; 337 338 string s; 339 if (!Rewrite(&s, rewrite, *str, vec, matches)) 340 return false; 341 342 assert(vec[0] >= 0); 343 assert(vec[1] >= 0); 344 str->replace(vec[0], vec[1] - vec[0], s); 345 return true; 346 } 347 348 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF. 349 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF. 350 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF. 
351 352 static int NewlineMode(int pcre_options) { 353 // TODO: if we can make it threadsafe, cache this var 354 int newline_mode = 0; 355 /* if (newline_mode) return newline_mode; */ // do this once it's cached 356 if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF| 357 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) { 358 newline_mode = (pcre_options & 359 (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF| 360 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)); 361 } else { 362 int newline; 363 pcre_config(PCRE_CONFIG_NEWLINE, &newline); 364 if (newline == 10) 365 newline_mode = PCRE_NEWLINE_LF; 366 else if (newline == 13) 367 newline_mode = PCRE_NEWLINE_CR; 368 else if (newline == 3338) 369 newline_mode = PCRE_NEWLINE_CRLF; 370 else if (newline == -1) 371 newline_mode = PCRE_NEWLINE_ANY; 372 else if (newline == -2) 373 newline_mode = PCRE_NEWLINE_ANYCRLF; 374 else 375 assert(NULL == "Unexpected return value from pcre_config(NEWLINE)"); 376 } 377 return newline_mode; 378 } 379 380 int RE::GlobalReplace(const StringPiece& rewrite, 381 string *str) const { 382 int count = 0; 383 int vec[kVecSize]; 384 string out; 385 int start = 0; 386 int lastend = -1; 387 bool last_match_was_empty_string = false; 388 389 while (start <= static_cast<int>(str->length())) { 390 // If the previous match was for the empty string, we shouldn't 391 // just match again: we'll match in the same way and get an 392 // infinite loop. Instead, we do the match in a special way: 393 // anchored -- to force another try at the same position -- 394 // and with a flag saying that this time, ignore empty matches. 395 // If this special match returns, that means there's a non-empty 396 // match at this position as well, and we can continue. If not, 397 // we do what perl does, and just advance by one. 
398 // Notice that perl prints '@@@' for this; 399 // perl -le '$_ = "aa"; s/b*|aa/@/g; print' 400 int matches; 401 if (last_match_was_empty_string) { 402 matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize); 403 if (matches <= 0) { 404 int matchend = start + 1; // advance one character. 405 // If the current char is CR and we're in CRLF mode, skip LF too. 406 // Note it's better to call pcre_fullinfo() than to examine 407 // all_options(), since options_ could have changed bewteen 408 // compile-time and now, but this is simpler and safe enough. 409 // Modified by PH to add ANY and ANYCRLF. 410 if (matchend < static_cast<int>(str->length()) && 411 (*str)[start] == '\r' && (*str)[matchend] == '\n' && 412 (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF || 413 NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY || 414 NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) { 415 matchend++; 416 } 417 // We also need to advance more than one char if we're in utf8 mode. 
418 #ifdef SUPPORT_UTF8 419 if (options_.utf8()) { 420 while (matchend < static_cast<int>(str->length()) && 421 ((*str)[matchend] & 0xc0) == 0x80) 422 matchend++; 423 } 424 #endif 425 if (start < static_cast<int>(str->length())) 426 out.append(*str, start, matchend - start); 427 start = matchend; 428 last_match_was_empty_string = false; 429 continue; 430 } 431 } else { 432 matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize); 433 if (matches <= 0) 434 break; 435 } 436 int matchstart = vec[0], matchend = vec[1]; 437 assert(matchstart >= start); 438 assert(matchend >= matchstart); 439 out.append(*str, start, matchstart - start); 440 Rewrite(&out, rewrite, *str, vec, matches); 441 start = matchend; 442 lastend = matchend; 443 count++; 444 last_match_was_empty_string = (matchstart == matchend); 445 } 446 447 if (count == 0) 448 return 0; 449 450 if (start < static_cast<int>(str->length())) 451 out.append(*str, start, str->length() - start); 452 swap(out, *str); 453 return count; 454 } 455 456 bool RE::Extract(const StringPiece& rewrite, 457 const StringPiece& text, 458 string *out) const { 459 int vec[kVecSize]; 460 int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); 461 if (matches == 0) 462 return false; 463 out->erase(); 464 return Rewrite(out, rewrite, text, vec, matches); 465 } 466 467 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) { 468 string result; 469 470 // Escape any ascii character not in [A-Za-z_0-9]. 471 // 472 // Note that it's legal to escape a character even if it has no 473 // special meaning in a regular expression -- so this function does 474 // that. (This also makes it identical to the perl function of the 475 // same name; see `perldoc -f quotemeta`.) The one exception is 476 // escaping NUL: rather than doing backslash + NUL, like perl does, 477 // we do '\0', because pcre itself doesn't take embedded NUL chars. 
478 for (int ii = 0; ii < unquoted.size(); ++ii) { 479 // Note that using 'isalnum' here raises the benchmark time from 480 // 32ns to 58ns: 481 if (unquoted[ii] == '\0') { 482 result += "\\0"; 483 } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && 484 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && 485 (unquoted[ii] < '0' || unquoted[ii] > '9') && 486 unquoted[ii] != '_' && 487 // If this is the part of a UTF8 or Latin1 character, we need 488 // to copy this byte without escaping. Experimentally this is 489 // what works correctly with the regexp library. 490 !(unquoted[ii] & 128)) { 491 result += '\\'; 492 result += unquoted[ii]; 493 } else { 494 result += unquoted[ii]; 495 } 496 } 497 498 return result; 499 } 500 501 /***** Actual matching and rewriting code *****/ 502 503 int RE::TryMatch(const StringPiece& text, 504 int startpos, 505 Anchor anchor, 506 bool empty_ok, 507 int *vec, 508 int vecsize) const { 509 pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; 510 if (re == NULL) { 511 //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str()); 512 return 0; 513 } 514 515 pcre_extra extra = { 0, 0, 0, 0, 0, 0 }; 516 if (options_.match_limit() > 0) { 517 extra.flags |= PCRE_EXTRA_MATCH_LIMIT; 518 extra.match_limit = options_.match_limit(); 519 } 520 if (options_.match_limit_recursion() > 0) { 521 extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; 522 extra.match_limit_recursion = options_.match_limit_recursion(); 523 } 524 525 int options = 0; 526 if (anchor != UNANCHORED) 527 options |= PCRE_ANCHORED; 528 if (!empty_ok) 529 options |= PCRE_NOTEMPTY; 530 531 int rc = pcre_exec(re, // The regular expression object 532 &extra, 533 (text.data() == NULL) ? 
"" : text.data(), 534 text.size(), 535 startpos, 536 options, 537 vec, 538 vecsize); 539 540 // Handle errors 541 if (rc == PCRE_ERROR_NOMATCH) { 542 return 0; 543 } else if (rc < 0) { 544 //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n", 545 // re, pattern_.c_str()); 546 return 0; 547 } else if (rc == 0) { 548 // pcre_exec() returns 0 as a special case when the number of 549 // capturing subpatterns exceeds the size of the vector. 550 // When this happens, there is a match and the output vector 551 // is filled, but we miss out on the positions of the extra subpatterns. 552 rc = vecsize / 2; 553 } 554 555 return rc; 556 } 557 558 bool RE::DoMatchImpl(const StringPiece& text, 559 Anchor anchor, 560 int* consumed, 561 const Arg* const* args, 562 int n, 563 int* vec, 564 int vecsize) const { 565 assert((1 + n) * 3 <= vecsize); // results + PCRE workspace 566 int matches = TryMatch(text, 0, anchor, true, vec, vecsize); 567 assert(matches >= 0); // TryMatch never returns negatives 568 if (matches == 0) 569 return false; 570 571 *consumed = vec[1]; 572 573 if (n == 0 || args == NULL) { 574 // We are not interested in results 575 return true; 576 } 577 578 if (NumberOfCapturingGroups() < n) { 579 // RE has fewer capturing groups than number of arg pointers passed in 580 return false; 581 } 582 583 // If we got here, we must have matched the whole pattern. 584 // We do not need (can not do) any more checks on the value of 'matches' here 585 // -- see the comment for TryMatch. 586 for (int i = 0; i < n; i++) { 587 const int start = vec[2*(i+1)]; 588 const int limit = vec[2*(i+1)+1]; 589 if (!args[i]->Parse(text.data() + start, limit-start)) { 590 // TODO: Should we indicate what the error was? 
591 return false; 592 } 593 } 594 595 return true; 596 } 597 598 bool RE::DoMatch(const StringPiece& text, 599 Anchor anchor, 600 int* consumed, 601 const Arg* const args[], 602 int n) const { 603 assert(n >= 0); 604 size_t const vecsize = (1 + n) * 3; // results + PCRE workspace 605 // (as for kVecSize) 606 int space[21]; // use stack allocation for small vecsize (common case) 607 int* vec = vecsize <= 21 ? space : new int[vecsize]; 608 bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize); 609 if (vec != space) delete [] vec; 610 return retval; 611 } 612 613 bool RE::Rewrite(string *out, const StringPiece &rewrite, 614 const StringPiece &text, int *vec, int veclen) const { 615 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 616 s < end; s++) { 617 int c = *s; 618 if (c == '\\') { 619 c = *++s; 620 if (isdigit(c)) { 621 int n = (c - '0'); 622 if (n >= veclen) { 623 //fprintf(stderr, requested group %d in regexp %.*s\n", 624 // n, rewrite.size(), rewrite.data()); 625 return false; 626 } 627 int start = vec[2 * n]; 628 if (start >= 0) 629 out->append(text.data() + start, vec[2 * n + 1] - start); 630 } else if (c == '\\') { 631 *out += '\\'; 632 } else { 633 //fprintf(stderr, "invalid rewrite pattern: %.*s\n", 634 // rewrite.size(), rewrite.data()); 635 return false; 636 } 637 } else { 638 *out += c; 639 } 640 } 641 return true; 642 } 643 644 // Return the number of capturing subpatterns, or -1 if the 645 // regexp wasn't valid on construction. 
646 int RE::NumberOfCapturingGroups() const { 647 if (re_partial_ == NULL) return -1; 648 649 int result; 650 int pcre_retval = pcre_fullinfo(re_partial_, // The regular expression object 651 NULL, // We did not study the pattern 652 PCRE_INFO_CAPTURECOUNT, 653 &result); 654 assert(pcre_retval == 0); 655 return result; 656 } 657 658 /***** Parsers for various types *****/ 659 660 bool Arg::parse_null(const char* str, int n, void* dest) { 661 // We fail if somebody asked us to store into a non-NULL void* pointer 662 return (dest == NULL); 663 } 664 665 bool Arg::parse_string(const char* str, int n, void* dest) { 666 if (dest == NULL) return true; 667 reinterpret_cast<string*>(dest)->assign(str, n); 668 return true; 669 } 670 671 bool Arg::parse_stringpiece(const char* str, int n, void* dest) { 672 if (dest == NULL) return true; 673 reinterpret_cast<StringPiece*>(dest)->set(str, n); 674 return true; 675 } 676 677 bool Arg::parse_char(const char* str, int n, void* dest) { 678 if (n != 1) return false; 679 if (dest == NULL) return true; 680 *(reinterpret_cast<char*>(dest)) = str[0]; 681 return true; 682 } 683 684 bool Arg::parse_uchar(const char* str, int n, void* dest) { 685 if (n != 1) return false; 686 if (dest == NULL) return true; 687 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; 688 return true; 689 } 690 691 // Largest number spec that we are willing to parse 692 static const int kMaxNumberLength = 32; 693 694 // REQUIRES "buf" must have length at least kMaxNumberLength+1 695 // REQUIRES "n > 0" 696 // Copies "str" into "buf" and null-terminates if necessary. 697 // Returns one of: 698 // a. "str" if no termination is needed 699 // b. "buf" if the string was copied and null-terminated 700 // c. 
"" if the input was invalid and has no hope of being parsed 701 static const char* TerminateNumber(char* buf, const char* str, int n) { 702 if ((n > 0) && isspace(*str)) { 703 // We are less forgiving than the strtoxxx() routines and do not 704 // allow leading spaces. 705 return ""; 706 } 707 708 // See if the character right after the input text may potentially 709 // look like a digit. 710 if (isdigit(str[n]) || 711 ((str[n] >= 'a') && (str[n] <= 'f')) || 712 ((str[n] >= 'A') && (str[n] <= 'F'))) { 713 if (n > kMaxNumberLength) return ""; // Input too big to be a valid number 714 memcpy(buf, str, n); 715 buf[n] = '\0'; 716 return buf; 717 } else { 718 // We can parse right out of the supplied string, so return it. 719 return str; 720 } 721 } 722 723 bool Arg::parse_long_radix(const char* str, 724 int n, 725 void* dest, 726 int radix) { 727 if (n == 0) return false; 728 char buf[kMaxNumberLength+1]; 729 str = TerminateNumber(buf, str, n); 730 char* end; 731 errno = 0; 732 long r = strtol(str, &end, radix); 733 if (end != str + n) return false; // Leftover junk 734 if (errno) return false; 735 if (dest == NULL) return true; 736 *(reinterpret_cast<long*>(dest)) = r; 737 return true; 738 } 739 740 bool Arg::parse_ulong_radix(const char* str, 741 int n, 742 void* dest, 743 int radix) { 744 if (n == 0) return false; 745 char buf[kMaxNumberLength+1]; 746 str = TerminateNumber(buf, str, n); 747 if (str[0] == '-') return false; // strtoul() on a negative number?! 
748 char* end; 749 errno = 0; 750 unsigned long r = strtoul(str, &end, radix); 751 if (end != str + n) return false; // Leftover junk 752 if (errno) return false; 753 if (dest == NULL) return true; 754 *(reinterpret_cast<unsigned long*>(dest)) = r; 755 return true; 756 } 757 758 bool Arg::parse_short_radix(const char* str, 759 int n, 760 void* dest, 761 int radix) { 762 long r; 763 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse 764 if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range 765 if (dest == NULL) return true; 766 *(reinterpret_cast<short*>(dest)) = static_cast<short>(r); 767 return true; 768 } 769 770 bool Arg::parse_ushort_radix(const char* str, 771 int n, 772 void* dest, 773 int radix) { 774 unsigned long r; 775 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse 776 if (r > USHRT_MAX) return false; // Out of range 777 if (dest == NULL) return true; 778 *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r); 779 return true; 780 } 781 782 bool Arg::parse_int_radix(const char* str, 783 int n, 784 void* dest, 785 int radix) { 786 long r; 787 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse 788 if (r < INT_MIN || r > INT_MAX) return false; // Out of range 789 if (dest == NULL) return true; 790 *(reinterpret_cast<int*>(dest)) = r; 791 return true; 792 } 793 794 bool Arg::parse_uint_radix(const char* str, 795 int n, 796 void* dest, 797 int radix) { 798 unsigned long r; 799 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse 800 if (r > UINT_MAX) return false; // Out of range 801 if (dest == NULL) return true; 802 *(reinterpret_cast<unsigned int*>(dest)) = r; 803 return true; 804 } 805 806 bool Arg::parse_longlong_radix(const char* str, 807 int n, 808 void* dest, 809 int radix) { 810 #ifndef HAVE_LONG_LONG 811 return false; 812 #else 813 if (n == 0) return false; 814 char buf[kMaxNumberLength+1]; 815 str = TerminateNumber(buf, 
str, n); 816 char* end; 817 errno = 0; 818 #if defined HAVE_STRTOQ 819 long long r = strtoq(str, &end, radix); 820 #elif defined HAVE_STRTOLL 821 long long r = strtoll(str, &end, radix); 822 #elif defined HAVE__STRTOI64 823 long long r = _strtoi64(str, &end, radix); 824 #elif defined HAVE_STRTOIMAX 825 long long r = strtoimax(str, &end, radix); 826 #else 827 #error parse_longlong_radix: cannot convert input to a long-long 828 #endif 829 if (end != str + n) return false; // Leftover junk 830 if (errno) return false; 831 if (dest == NULL) return true; 832 *(reinterpret_cast<long long*>(dest)) = r; 833 return true; 834 #endif /* HAVE_LONG_LONG */ 835 } 836 837 bool Arg::parse_ulonglong_radix(const char* str, 838 int n, 839 void* dest, 840 int radix) { 841 #ifndef HAVE_UNSIGNED_LONG_LONG 842 return false; 843 #else 844 if (n == 0) return false; 845 char buf[kMaxNumberLength+1]; 846 str = TerminateNumber(buf, str, n); 847 if (str[0] == '-') return false; // strtoull() on a negative number?! 848 char* end; 849 errno = 0; 850 #if defined HAVE_STRTOQ 851 unsigned long long r = strtouq(str, &end, radix); 852 #elif defined HAVE_STRTOLL 853 unsigned long long r = strtoull(str, &end, radix); 854 #elif defined HAVE__STRTOI64 855 unsigned long long r = _strtoui64(str, &end, radix); 856 #elif defined HAVE_STRTOIMAX 857 unsigned long long r = strtoumax(str, &end, radix); 858 #else 859 #error parse_ulonglong_radix: cannot convert input to a long-long 860 #endif 861 if (end != str + n) return false; // Leftover junk 862 if (errno) return false; 863 if (dest == NULL) return true; 864 *(reinterpret_cast<unsigned long long*>(dest)) = r; 865 return true; 866 #endif /* HAVE_UNSIGNED_LONG_LONG */ 867 } 868 869 bool Arg::parse_double(const char* str, int n, void* dest) { 870 if (n == 0) return false; 871 static const int kMaxLength = 200; 872 char buf[kMaxLength]; 873 if (n >= kMaxLength) return false; 874 memcpy(buf, str, n); 875 buf[n] = '\0'; 876 errno = 0; 877 char* end; 878 double r = 
strtod(buf, &end); 879 if (end != buf + n) return false; // Leftover junk 880 if (errno) return false; 881 if (dest == NULL) return true; 882 *(reinterpret_cast<double*>(dest)) = r; 883 return true; 884 } 885 886 bool Arg::parse_float(const char* str, int n, void* dest) { 887 double r; 888 if (!parse_double(str, n, &r)) return false; 889 if (dest == NULL) return true; 890 *(reinterpret_cast<float*>(dest)) = static_cast<float>(r); 891 return true; 892 } 893 894 895 #define DEFINE_INTEGER_PARSERS(name) \ 896 bool Arg::parse_##name(const char* str, int n, void* dest) { \ 897 return parse_##name##_radix(str, n, dest, 10); \ 898 } \ 899 bool Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ 900 return parse_##name##_radix(str, n, dest, 16); \ 901 } \ 902 bool Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ 903 return parse_##name##_radix(str, n, dest, 8); \ 904 } \ 905 bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ 906 return parse_##name##_radix(str, n, dest, 0); \ 907 } 908 909 DEFINE_INTEGER_PARSERS(short) /* */ 910 DEFINE_INTEGER_PARSERS(ushort) /* */ 911 DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */ 912 DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */ 913 DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */ 914 DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */ 915 DEFINE_INTEGER_PARSERS(longlong) /* */ 916 DEFINE_INTEGER_PARSERS(ulonglong) /* */ 917 918 #undef DEFINE_INTEGER_PARSERS 919 920 } // namespace pcrecpp 921