1 // Copyright 2003-2009 Google Inc. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This is a variant of PCRE's pcrecpp.cc, originally written at Google. 6 // The main changes are the addition of the HitLimit method and 7 // compilation as PCRE in namespace re2. 8 9 #include <errno.h> 10 #include "util/util.h" 11 #include "util/flags.h" 12 #include "util/pcre.h" 13 14 #ifdef WIN32 15 #define strtoll _strtoi64 16 #define strtoull _strtoui64 17 #endif 18 19 #define PCREPORT(level) LOG(level) 20 21 // Default PCRE limits. 22 // Defaults chosen to allow a plausible amount of CPU and 23 // not exceed main thread stacks. Note that other threads 24 // often have smaller stacks, and therefore tightening 25 // regexp_stack_limit may frequently be necessary. 26 DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)"); 27 DEFINE_int32(regexp_match_limit, 1000000, 28 "default PCRE match limit (function calls)"); 29 30 namespace re2 { 31 32 // Maximum number of args we can set 33 static const int kMaxArgs = 16; 34 static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace 35 36 // Approximate size of a recursive invocation of PCRE's 37 // internal "match()" frame. This varies depending on the 38 // compiler and architecture, of course, so the constant is 39 // just a conservative estimate. To find the exact number, 40 // run regexp_unittest with --regexp_stack_limit=0 under 41 // a debugger and look at the frames when it crashes. 42 // The exact frame size was 656 in production on 2008/02/03. 43 static const int kPCREFrameSize = 700; 44 45 // Special name for missing C++ arguments. 46 PCRE::Arg PCRE::no_more_args((void*)NULL); 47 48 const PCRE::PartialMatchFunctor PCRE::PartialMatch = { }; 49 const PCRE::FullMatchFunctor PCRE::FullMatch = { } ; 50 const PCRE::ConsumeFunctor PCRE::Consume = { }; 51 const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { }; 52 53 // If a regular expression has no error, its error_ field points here 54 static const string empty_string; 55 56 void PCRE::Init(const char* pattern, Option options, int match_limit, 57 int stack_limit, bool report_errors) { 58 pattern_ = pattern; 59 options_ = options; 60 match_limit_ = match_limit; 61 stack_limit_ = stack_limit; 62 hit_limit_ = false; 63 error_ = &empty_string; 64 report_errors_ = report_errors; 65 re_full_ = NULL; 66 re_partial_ = NULL; 67 68 if (options & ~(EnabledCompileOptions | EnabledExecOptions)) { 69 error_ = new string("illegal regexp option"); 70 PCREPORT(ERROR) 71 << "Error compiling '" << pattern << "': illegal regexp option"; 72 } else { 73 re_partial_ = Compile(UNANCHORED); 74 if (re_partial_ != NULL) { 75 re_full_ = Compile(ANCHOR_BOTH); 76 } 77 } 78 } 79 80 PCRE::PCRE(const char* pattern) { 81 Init(pattern, None, 0, 0, true); 82 } 83 PCRE::PCRE(const char* pattern, Option option) { 84 Init(pattern, option, 0, 0, true); 85 } 86 PCRE::PCRE(const string& pattern) { 87 Init(pattern.c_str(), None, 0, 0, true); 88 } 89 PCRE::PCRE(const string& pattern, Option option) { 90 Init(pattern.c_str(), option, 0, 0, true); 91 } 92 PCRE::PCRE(const string& pattern, const PCRE_Options& re_option) { 93 Init(pattern.c_str(), re_option.option(), re_option.match_limit(), 94 re_option.stack_limit(), re_option.report_errors()); 95 } 96 97 PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) { 98 Init(pattern, re_option.option(), re_option.match_limit(), 99 re_option.stack_limit(), re_option.report_errors()); 100 } 101 102 PCRE::~PCRE() { 103 if (re_full_ != NULL) pcre_free(re_full_); 104 if (re_partial_ != NULL) pcre_free(re_partial_); 105 if (error_ != &empty_string) delete error_; 106 } 107 108 pcre* PCRE::Compile(Anchor anchor) { 109 // Special treatment for anchoring. This is needed because at 110 // runtime pcre only provides an option for anchoring at the 111 // beginning of a string. 112 // 113 // There are three types of anchoring we want: 114 // UNANCHORED Compile the original pattern, and use 115 // a pcre unanchored match. 116 // ANCHOR_START Compile the original pattern, and use 117 // a pcre anchored match. 118 // ANCHOR_BOTH Tack a "\z" to the end of the original pattern 119 // and use a pcre anchored match. 120 121 const char* error; 122 int eoffset; 123 pcre* re; 124 if (anchor != ANCHOR_BOTH) { 125 re = pcre_compile(pattern_.c_str(), 126 (options_ & EnabledCompileOptions), 127 &error, &eoffset, NULL); 128 } else { 129 // Tack a '\z' at the end of PCRE. Parenthesize it first so that 130 // the '\z' applies to all top-level alternatives in the regexp. 131 string wrapped = "(?:"; // A non-counting grouping operator 132 wrapped += pattern_; 133 wrapped += ")\\z"; 134 re = pcre_compile(wrapped.c_str(), 135 (options_ & EnabledCompileOptions), 136 &error, &eoffset, NULL); 137 } 138 if (re == NULL) { 139 if (error_ == &empty_string) error_ = new string(error); 140 PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error; 141 } 142 return re; 143 } 144 145 /***** Convenience interfaces *****/ 146 147 bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, 148 const PCRE& re, 149 const Arg& a0, 150 const Arg& a1, 151 const Arg& a2, 152 const Arg& a3, 153 const Arg& a4, 154 const Arg& a5, 155 const Arg& a6, 156 const Arg& a7, 157 const Arg& a8, 158 const Arg& a9, 159 const Arg& a10, 160 const Arg& a11, 161 const Arg& a12, 162 const Arg& a13, 163 const Arg& a14, 164 const Arg& a15) const { 165 const Arg* args[kMaxArgs]; 166 int n = 0; 167 if (&a0 == &no_more_args) goto done; args[n++] = &a0; 168 if (&a1 == &no_more_args) goto done; args[n++] = &a1; 169 if (&a2 == &no_more_args) goto done; args[n++] = &a2; 170 if (&a3 == &no_more_args) goto done; args[n++] = &a3; 171 if (&a4 == &no_more_args) goto done; args[n++] = &a4; 172 if (&a5 == &no_more_args) goto done; args[n++] = &a5; 173 if (&a6 == &no_more_args) goto done; args[n++] = &a6; 174 if (&a7 == &no_more_args) goto done; args[n++] = &a7; 175 if (&a8 == &no_more_args) goto done; args[n++] = &a8; 176 if (&a9 == &no_more_args) goto done; args[n++] = &a9; 177 if (&a10 == &no_more_args) goto done; args[n++] = &a10; 178 if (&a11 == &no_more_args) goto done; args[n++] = &a11; 179 if (&a12 == &no_more_args) goto done; args[n++] = &a12; 180 if (&a13 == &no_more_args) goto done; args[n++] = &a13; 181 if (&a14 == &no_more_args) goto done; args[n++] = &a14; 182 if (&a15 == &no_more_args) goto done; args[n++] = &a15; 183 done: 184 185 int consumed; 186 int vec[kVecSize]; 187 return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); 188 } 189 190 bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, 191 const PCRE& re, 192 const Arg& a0, 193 const Arg& a1, 194 const Arg& a2, 195 const Arg& a3, 196 const Arg& a4, 197 const Arg& a5, 198 const Arg& a6, 199 const Arg& a7, 200 const Arg& a8, 201 const Arg& a9, 202 const Arg& a10, 203 const Arg& a11, 204 const Arg& a12, 205 const Arg& a13, 206 const Arg& a14, 207 const Arg& a15) const { 208 const Arg* args[kMaxArgs]; 209 int n = 0; 210 if (&a0 == &no_more_args) goto done; args[n++] = &a0; 211 if (&a1 == &no_more_args) goto done; args[n++] = &a1; 212 if (&a2 == &no_more_args) goto done; args[n++] = &a2; 213 if (&a3 == &no_more_args) goto done; args[n++] = &a3; 214 if (&a4 == &no_more_args) goto done; args[n++] = &a4; 215 if (&a5 == &no_more_args) goto done; args[n++] = &a5; 216 if (&a6 == &no_more_args) goto done; args[n++] = &a6; 217 if (&a7 == &no_more_args) goto done; args[n++] = &a7; 218 if (&a8 == &no_more_args) goto done; args[n++] = &a8; 219 if (&a9 == &no_more_args) goto done; args[n++] = &a9; 220 if (&a10 == &no_more_args) goto done; args[n++] = &a10; 221 if (&a11 == &no_more_args) goto done; args[n++] = &a11; 222 if (&a12 == &no_more_args) goto done; args[n++] = &a12; 223 if (&a13 == &no_more_args) goto done; args[n++] = &a13; 224 if (&a14 == &no_more_args) goto done; args[n++] = &a14; 225 if (&a15 == &no_more_args) goto done; args[n++] = &a15; 226 done: 227 228 int consumed; 229 int vec[kVecSize]; 230 return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); 231 } 232 233 bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, 234 const PCRE& pattern, 235 const Arg& a0, 236 const Arg& a1, 237 const Arg& a2, 238 const Arg& a3, 239 const Arg& a4, 240 const Arg& a5, 241 const Arg& a6, 242 const Arg& a7, 243 const Arg& a8, 244 const Arg& a9, 245 const Arg& a10, 246 const Arg& a11, 247 const Arg& a12, 248 const Arg& a13, 249 const Arg& a14, 250 const Arg& a15) const { 251 const Arg* args[kMaxArgs]; 252 int n = 0; 253 if (&a0 == &no_more_args) goto done; args[n++] = &a0; 254 if (&a1 == &no_more_args) goto done; args[n++] = &a1; 255 if (&a2 == &no_more_args) goto done; args[n++] = &a2; 256 if (&a3 == &no_more_args) goto done; args[n++] = &a3; 257 if (&a4 == &no_more_args) goto done; args[n++] = &a4; 258 if (&a5 == &no_more_args) goto done; args[n++] = &a5; 259 if (&a6 == &no_more_args) goto done; args[n++] = &a6; 260 if (&a7 == &no_more_args) goto done; args[n++] = &a7; 261 if (&a8 == &no_more_args) goto done; args[n++] = &a8; 262 if (&a9 == &no_more_args) goto done; args[n++] = &a9; 263 if (&a10 == &no_more_args) goto done; args[n++] = &a10; 264 if (&a11 == &no_more_args) goto done; args[n++] = &a11; 265 if (&a12 == &no_more_args) goto done; args[n++] = &a12; 266 if (&a13 == &no_more_args) goto done; args[n++] = &a13; 267 if (&a14 == &no_more_args) goto done; args[n++] = &a14; 268 if (&a15 == &no_more_args) goto done; args[n++] = &a15; 269 done: 270 271 int consumed; 272 int vec[kVecSize]; 273 if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, 274 args, n, vec, kVecSize)) { 275 input->remove_prefix(consumed); 276 return true; 277 } else { 278 return false; 279 } 280 } 281 282 bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, 283 const PCRE& pattern, 284 const Arg& a0, 285 const Arg& a1, 286 const Arg& a2, 287 const Arg& a3, 288 const Arg& a4, 289 const Arg& a5, 290 const Arg& a6, 291 const Arg& a7, 292 const Arg& a8, 293 const Arg& a9, 294 const Arg& a10, 295 const Arg& a11, 296 const Arg& a12, 297 const Arg& a13, 298 const Arg& a14, 299 const Arg& a15) const { 300 const Arg* args[kMaxArgs]; 301 int n = 0; 302 if (&a0 == &no_more_args) goto done; args[n++] = &a0; 303 if (&a1 == &no_more_args) goto done; args[n++] = &a1; 304 if (&a2 == &no_more_args) goto done; args[n++] = &a2; 305 if (&a3 == &no_more_args) goto done; args[n++] = &a3; 306 if (&a4 == &no_more_args) goto done; args[n++] = &a4; 307 if (&a5 == &no_more_args) goto done; args[n++] = &a5; 308 if (&a6 == &no_more_args) goto done; args[n++] = &a6; 309 if (&a7 == &no_more_args) goto done; args[n++] = &a7; 310 if (&a8 == &no_more_args) goto done; args[n++] = &a8; 311 if (&a9 == &no_more_args) goto done; args[n++] = &a9; 312 if (&a10 == &no_more_args) goto done; args[n++] = &a10; 313 if (&a11 == &no_more_args) goto done; args[n++] = &a11; 314 if (&a12 == &no_more_args) goto done; args[n++] = &a12; 315 if (&a13 == &no_more_args) goto done; args[n++] = &a13; 316 if (&a14 == &no_more_args) goto done; args[n++] = &a14; 317 if (&a15 == &no_more_args) goto done; args[n++] = &a15; 318 done: 319 320 int consumed; 321 int vec[kVecSize]; 322 if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, 323 args, n, vec, kVecSize)) { 324 input->remove_prefix(consumed); 325 return true; 326 } else { 327 return false; 328 } 329 } 330 331 bool PCRE::Replace(string *str, 332 const PCRE& pattern, 333 const StringPiece& rewrite) { 334 int vec[kVecSize]; 335 int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); 336 if (matches == 0) 337 return false; 338 339 string s; 340 if (!pattern.Rewrite(&s, rewrite, *str, vec, matches)) 341 return false; 342 343 assert(vec[0] >= 0); 344 assert(vec[1] >= 0); 345 str->replace(vec[0], vec[1] - vec[0], s); 346 return true; 347 } 348 349 int PCRE::GlobalReplace(string *str, 350 const PCRE& pattern, 351 const StringPiece& rewrite) { 352 int count = 0; 353 int vec[kVecSize]; 354 string out; 355 int start = 0; 356 bool last_match_was_empty_string = false; 357 358 for (; start <= str->length();) { 359 // If the previous match was for the empty string, we shouldn't 360 // just match again: we'll match in the same way and get an 361 // infinite loop. Instead, we do the match in a special way: 362 // anchored -- to force another try at the same position -- 363 // and with a flag saying that this time, ignore empty matches. 364 // If this special match returns, that means there's a non-empty 365 // match at this position as well, and we can continue. If not, 366 // we do what perl does, and just advance by one. 367 // Notice that perl prints '@@@' for this; 368 // perl -le '$_ = "aa"; s/b*|aa/@/g; print' 369 int matches; 370 if (last_match_was_empty_string) { 371 matches = pattern.TryMatch(*str, start, ANCHOR_START, false, 372 vec, kVecSize); 373 if (matches <= 0) { 374 if (start < str->length()) 375 out.push_back((*str)[start]); 376 start++; 377 last_match_was_empty_string = false; 378 continue; 379 } 380 } else { 381 matches = pattern.TryMatch(*str, start, UNANCHORED, true, vec, kVecSize); 382 if (matches <= 0) 383 break; 384 } 385 int matchstart = vec[0], matchend = vec[1]; 386 assert(matchstart >= start); 387 assert(matchend >= matchstart); 388 389 out.append(*str, start, matchstart - start); 390 pattern.Rewrite(&out, rewrite, *str, vec, matches); 391 start = matchend; 392 count++; 393 last_match_was_empty_string = (matchstart == matchend); 394 } 395 396 if (count == 0) 397 return 0; 398 399 if (start < str->length()) 400 out.append(*str, start, str->length() - start); 401 swap(out, *str); 402 return count; 403 } 404 405 bool PCRE::Extract(const StringPiece &text, 406 const PCRE& pattern, 407 const StringPiece &rewrite, 408 string *out) { 409 int vec[kVecSize]; 410 int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); 411 if (matches == 0) 412 return false; 413 out->clear(); 414 return pattern.Rewrite(out, rewrite, text, vec, matches); 415 } 416 417 string PCRE::QuoteMeta(const StringPiece& unquoted) { 418 string result; 419 result.reserve(unquoted.size() << 1); 420 421 // Escape any ascii character not in [A-Za-z_0-9]. 422 // 423 // Note that it's legal to escape a character even if it has no 424 // special meaning in a regular expression -- so this function does 425 // that. (This also makes it identical to the perl function of the 426 // same name except for the null-character special case; 427 // see `perldoc -f quotemeta`.) 428 for (int ii = 0; ii < unquoted.length(); ++ii) { 429 // Note that using 'isalnum' here raises the benchmark time from 430 // 32ns to 58ns: 431 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && 432 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && 433 (unquoted[ii] < '0' || unquoted[ii] > '9') && 434 unquoted[ii] != '_' && 435 // If this is the part of a UTF8 or Latin1 character, we need 436 // to copy this byte without escaping. Experimentally this is 437 // what works correctly with the regexp library. 438 !(unquoted[ii] & 128)) { 439 if (unquoted[ii] == '\0') { // Special handling for null chars. 440 // Can't use "\\0" since the next character might be a digit. 441 result += "\\x00"; 442 continue; 443 } 444 result += '\\'; 445 } 446 result += unquoted[ii]; 447 } 448 449 return result; 450 } 451 452 /***** Actual matching and rewriting code *****/ 453 454 bool PCRE::HitLimit() { 455 return hit_limit_; 456 } 457 458 void PCRE::ClearHitLimit() { 459 hit_limit_ = 0; 460 } 461 462 int PCRE::TryMatch(const StringPiece& text, 463 int startpos, 464 Anchor anchor, 465 bool empty_ok, 466 int *vec, 467 int vecsize) const { 468 pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; 469 if (re == NULL) { 470 PCREPORT(ERROR) << "Matching against invalid re: " << *error_; 471 return 0; 472 } 473 474 int match_limit = match_limit_; 475 if (match_limit <= 0) { 476 match_limit = FLAGS_regexp_match_limit; 477 } 478 479 int stack_limit = stack_limit_; 480 if (stack_limit <= 0) { 481 stack_limit = FLAGS_regexp_stack_limit; 482 } 483 484 pcre_extra extra = { 0 }; 485 if (match_limit > 0) { 486 extra.flags |= PCRE_EXTRA_MATCH_LIMIT; 487 extra.match_limit = match_limit; 488 } 489 if (stack_limit > 0) { 490 extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; 491 extra.match_limit_recursion = stack_limit / kPCREFrameSize; 492 } 493 494 int options = 0; 495 if (anchor != UNANCHORED) 496 options |= PCRE_ANCHORED; 497 if (!empty_ok) 498 options |= PCRE_NOTEMPTY; 499 500 int rc = pcre_exec(re, // The regular expression object 501 &extra, 502 (text.data() == NULL) ? "" : text.data(), 503 text.size(), 504 startpos, 505 options, 506 vec, 507 vecsize); 508 509 // Handle errors 510 if (rc == 0) { 511 // pcre_exec() returns 0 as a special case when the number of 512 // capturing subpatterns exceeds the size of the vector. 513 // When this happens, there is a match and the output vector 514 // is filled, but we miss out on the positions of the extra subpatterns. 515 rc = vecsize / 2; 516 } else if (rc < 0) { 517 switch (rc) { 518 case PCRE_ERROR_NOMATCH: 519 return 0; 520 case PCRE_ERROR_MATCHLIMIT: 521 // Writing to hit_limit is not safe if multiple threads 522 // are using the PCRE, but the flag is only intended 523 // for use by unit tests anyway, so we let it go. 524 hit_limit_ = true; 525 PCREPORT(WARNING) << "Exceeded match limit of " << match_limit 526 << " when matching '" << pattern_ << "'" 527 << " against text that is " << text.size() << " bytes."; 528 return 0; 529 case PCRE_ERROR_RECURSIONLIMIT: 530 // See comment about hit_limit above. 531 hit_limit_ = true; 532 PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit 533 << " when matching '" << pattern_ << "'" 534 << " against text that is " << text.size() << " bytes."; 535 return 0; 536 default: 537 // There are other return codes from pcre.h : 538 // PCRE_ERROR_NULL (-2) 539 // PCRE_ERROR_BADOPTION (-3) 540 // PCRE_ERROR_BADMAGIC (-4) 541 // PCRE_ERROR_UNKNOWN_NODE (-5) 542 // PCRE_ERROR_NOMEMORY (-6) 543 // PCRE_ERROR_NOSUBSTRING (-7) 544 // ... 545 PCREPORT(ERROR) << "Unexpected return code: " << rc 546 << " when matching '" << pattern_ << "'" 547 << ", re=" << re 548 << ", text=" << text 549 << ", vec=" << vec 550 << ", vecsize=" << vecsize; 551 return 0; 552 } 553 } 554 555 return rc; 556 } 557 558 bool PCRE::DoMatchImpl(const StringPiece& text, 559 Anchor anchor, 560 int* consumed, 561 const Arg* const* args, 562 int n, 563 int* vec, 564 int vecsize) const { 565 assert((1 + n) * 3 <= vecsize); // results + PCRE workspace 566 int matches = TryMatch(text, 0, anchor, true, vec, vecsize); 567 assert(matches >= 0); // TryMatch never returns negatives 568 if (matches == 0) 569 return false; 570 571 *consumed = vec[1]; 572 573 if (n == 0 || args == NULL) { 574 // We are not interested in results 575 return true; 576 } 577 if (NumberOfCapturingGroups() < n) { 578 // PCRE has fewer capturing groups than number of arg pointers passed in 579 return false; 580 } 581 582 // If we got here, we must have matched the whole pattern. 583 // We do not need (can not do) any more checks on the value of 'matches' here 584 // -- see the comment for TryMatch. 585 for (int i = 0; i < n; i++) { 586 const int start = vec[2*(i+1)]; 587 const int limit = vec[2*(i+1)+1]; 588 if (!args[i]->Parse(text.data() + start, limit-start)) { 589 // TODO: Should we indicate what the error was? 590 return false; 591 } 592 } 593 594 return true; 595 } 596 597 bool PCRE::DoMatch(const StringPiece& text, 598 Anchor anchor, 599 int* consumed, 600 const Arg* const args[], 601 int n) const { 602 assert(n >= 0); 603 size_t const vecsize = (1 + n) * 3; // results + PCRE workspace 604 // (as for kVecSize) 605 int *vec = new int[vecsize]; 606 bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); 607 delete[] vec; 608 return b; 609 } 610 611 bool PCRE::Rewrite(string *out, const StringPiece &rewrite, 612 const StringPiece &text, int *vec, int veclen) const { 613 int number_of_capturing_groups = NumberOfCapturingGroups(); 614 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 615 s < end; s++) { 616 int c = *s; 617 if (c == '\\') { 618 c = *++s; 619 if (isdigit(c)) { 620 int n = (c - '0'); 621 if (n >= veclen) { 622 if (n <= number_of_capturing_groups) { 623 // unmatched optional capturing group. treat 624 // its value as empty string; i.e., nothing to append. 625 } else { 626 PCREPORT(ERROR) << "requested group " << n 627 << " in regexp " << rewrite.data(); 628 return false; 629 } 630 } 631 int start = vec[2 * n]; 632 if (start >= 0) 633 out->append(text.data() + start, vec[2 * n + 1] - start); 634 } else if (c == '\\') { 635 out->push_back('\\'); 636 } else { 637 PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data(); 638 return false; 639 } 640 } else { 641 out->push_back(c); 642 } 643 } 644 return true; 645 } 646 647 bool PCRE::CheckRewriteString(const StringPiece& rewrite, string* error) const { 648 int max_token = -1; 649 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 650 s < end; s++) { 651 int c = *s; 652 if (c != '\\') { 653 continue; 654 } 655 if (++s == end) { 656 *error = "Rewrite schema error: '\\' not allowed at end."; 657 return false; 658 } 659 c = *s; 660 if (c == '\\') { 661 continue; 662 } 663 if (!isdigit(c)) { 664 *error = "Rewrite schema error: " 665 "'\\' must be followed by a digit or '\\'."; 666 return false; 667 } 668 int n = (c - '0'); 669 if (max_token < n) { 670 max_token = n; 671 } 672 } 673 674 if (max_token > NumberOfCapturingGroups()) { 675 SStringPrintf(error, "Rewrite schema requests %d matches, " 676 "but the regexp only has %d parenthesized subexpressions.", 677 max_token, NumberOfCapturingGroups()); 678 return false; 679 } 680 return true; 681 } 682 683 684 // Return the number of capturing subpatterns, or -1 if the 685 // regexp wasn't valid on construction. 686 int PCRE::NumberOfCapturingGroups() const { 687 if (re_partial_ == NULL) return -1; 688 689 int result; 690 CHECK(pcre_fullinfo(re_partial_, // The regular expression object 691 NULL, // We did not study the pattern 692 PCRE_INFO_CAPTURECOUNT, 693 &result) == 0); 694 return result; 695 } 696 697 698 /***** Parsers for various types *****/ 699 700 bool PCRE::Arg::parse_null(const char* str, int n, void* dest) { 701 // We fail if somebody asked us to store into a non-NULL void* pointer 702 return (dest == NULL); 703 } 704 705 bool PCRE::Arg::parse_string(const char* str, int n, void* dest) { 706 if (dest == NULL) return true; 707 reinterpret_cast<string*>(dest)->assign(str, n); 708 return true; 709 } 710 711 bool PCRE::Arg::parse_stringpiece(const char* str, int n, void* dest) { 712 if (dest == NULL) return true; 713 reinterpret_cast<StringPiece*>(dest)->set(str, n); 714 return true; 715 } 716 717 bool PCRE::Arg::parse_char(const char* str, int n, void* dest) { 718 if (n != 1) return false; 719 if (dest == NULL) return true; 720 *(reinterpret_cast<char*>(dest)) = str[0]; 721 return true; 722 } 723 724 bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) { 725 if (n != 1) return false; 726 if (dest == NULL) return true; 727 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; 728 return true; 729 } 730 731 // Largest number spec that we are willing to parse 732 static const int kMaxNumberLength = 32; 733 734 // PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1 735 // PCREQUIPCRES "n > 0" 736 // Copies "str" into "buf" and null-terminates if necessary. 737 // Returns one of: 738 // a. "str" if no termination is needed 739 // b. "buf" if the string was copied and null-terminated 740 // c. "" if the input was invalid and has no hope of being parsed 741 static const char* TerminateNumber(char* buf, const char* str, int n) { 742 if ((n > 0) && isspace(*str)) { 743 // We are less forgiving than the strtoxxx() routines and do not 744 // allow leading spaces. 745 return ""; 746 } 747 748 // See if the character right after the input text may potentially 749 // look like a digit. 750 if (isdigit(str[n]) || 751 ((str[n] >= 'a') && (str[n] <= 'f')) || 752 ((str[n] >= 'A') && (str[n] <= 'F'))) { 753 if (n > kMaxNumberLength) return ""; // Input too big to be a valid number 754 memcpy(buf, str, n); 755 buf[n] = '\0'; 756 return buf; 757 } else { 758 // We can parse right out of the supplied string, so return it. 759 return str; 760 } 761 } 762 763 bool PCRE::Arg::parse_long_radix(const char* str, 764 int n, 765 void* dest, 766 int radix) { 767 if (n == 0) return false; 768 char buf[kMaxNumberLength+1]; 769 str = TerminateNumber(buf, str, n); 770 char* end; 771 errno = 0; 772 long r = strtol(str, &end, radix); 773 if (end != str + n) return false; // Leftover junk 774 if (errno) return false; 775 if (dest == NULL) return true; 776 *(reinterpret_cast<long*>(dest)) = r; 777 return true; 778 } 779 780 bool PCRE::Arg::parse_ulong_radix(const char* str, 781 int n, 782 void* dest, 783 int radix) { 784 if (n == 0) return false; 785 char buf[kMaxNumberLength+1]; 786 str = TerminateNumber(buf, str, n); 787 if (str[0] == '-') { 788 // strtoul() will silently accept negative numbers and parse 789 // them. This module is more strict and treats them as errors. 790 return false; 791 } 792 793 char* end; 794 errno = 0; 795 unsigned long r = strtoul(str, &end, radix); 796 if (end != str + n) return false; // Leftover junk 797 if (errno) return false; 798 if (dest == NULL) return true; 799 *(reinterpret_cast<unsigned long*>(dest)) = r; 800 return true; 801 } 802 803 bool PCRE::Arg::parse_short_radix(const char* str, 804 int n, 805 void* dest, 806 int radix) { 807 long r; 808 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse 809 if ((short)r != r) return false; // Out of range 810 if (dest == NULL) return true; 811 *(reinterpret_cast<short*>(dest)) = r; 812 return true; 813 } 814 815 bool PCRE::Arg::parse_ushort_radix(const char* str, 816 int n, 817 void* dest, 818 int radix) { 819 unsigned long r; 820 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse 821 if ((ushort)r != r) return false; // Out of range 822 if (dest == NULL) return true; 823 *(reinterpret_cast<unsigned short*>(dest)) = r; 824 return true; 825 } 826 827 bool PCRE::Arg::parse_int_radix(const char* str, 828 int n, 829 void* dest, 830 int radix) { 831 long r; 832 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse 833 if ((int)r != r) return false; // Out of range 834 if (dest == NULL) return true; 835 *(reinterpret_cast<int*>(dest)) = r; 836 return true; 837 } 838 839 bool PCRE::Arg::parse_uint_radix(const char* str, 840 int n, 841 void* dest, 842 int radix) { 843 unsigned long r; 844 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse 845 if ((uint)r != r) return false; // Out of range 846 if (dest == NULL) return true; 847 *(reinterpret_cast<unsigned int*>(dest)) = r; 848 return true; 849 } 850 851 bool PCRE::Arg::parse_longlong_radix(const char* str, 852 int n, 853 void* dest, 854 int radix) { 855 if (n == 0) return false; 856 char buf[kMaxNumberLength+1]; 857 str = TerminateNumber(buf, str, n); 858 char* end; 859 errno = 0; 860 int64 r = strtoll(str, &end, radix); 861 if (end != str + n) return false; // Leftover junk 862 if (errno) return false; 863 if (dest == NULL) return true; 864 *(reinterpret_cast<int64*>(dest)) = r; 865 return true; 866 } 867 868 bool PCRE::Arg::parse_ulonglong_radix(const char* str, 869 int n, 870 void* dest, 871 int radix) { 872 if (n == 0) return false; 873 char buf[kMaxNumberLength+1]; 874 str = TerminateNumber(buf, str, n); 875 if (str[0] == '-') { 876 // strtoull() will silently accept negative numbers and parse 877 // them. This module is more strict and treats them as errors. 878 return false; 879 } 880 char* end; 881 errno = 0; 882 uint64 r = strtoull(str, &end, radix); 883 if (end != str + n) return false; // Leftover junk 884 if (errno) return false; 885 if (dest == NULL) return true; 886 *(reinterpret_cast<uint64*>(dest)) = r; 887 return true; 888 } 889 890 bool PCRE::Arg::parse_double(const char* str, int n, void* dest) { 891 if (n == 0) return false; 892 static const int kMaxLength = 200; 893 char buf[kMaxLength]; 894 if (n >= kMaxLength) return false; 895 memcpy(buf, str, n); 896 buf[n] = '\0'; 897 errno = 0; 898 char* end; 899 double r = strtod(buf, &end); 900 if (end != buf + n) { 901 #ifdef COMPILER_MSVC 902 // Microsoft's strtod() doesn't handle inf and nan, so we have to 903 // handle it explicitly. Speed is not important here because this 904 // code is only called in unit tests. 905 bool pos = true; 906 const char* i = buf; 907 if ('-' == *i) { 908 pos = false; 909 ++i; 910 } else if ('+' == *i) { 911 ++i; 912 } 913 if (0 == stricmp(i, "inf") || 0 == stricmp(i, "infinity")) { 914 r = numeric_limits<double>::infinity(); 915 if (!pos) 916 r = -r; 917 } else if (0 == stricmp(i, "nan")) { 918 r = numeric_limits<double>::quiet_NaN(); 919 } else { 920 return false; 921 } 922 #else 923 return false; // Leftover junk 924 #endif 925 } 926 if (errno) return false; 927 if (dest == NULL) return true; 928 *(reinterpret_cast<double*>(dest)) = r; 929 return true; 930 } 931 932 bool PCRE::Arg::parse_float(const char* str, int n, void* dest) { 933 double r; 934 if (!parse_double(str, n, &r)) return false; 935 if (dest == NULL) return true; 936 *(reinterpret_cast<float*>(dest)) = static_cast<float>(r); 937 return true; 938 } 939 940 941 #define DEFINE_INTEGER_PARSERS(name) \ 942 bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) { \ 943 return parse_##name##_radix(str, n, dest, 10); \ 944 } \ 945 bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ 946 return parse_##name##_radix(str, n, dest, 16); \ 947 } \ 948 bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ 949 return parse_##name##_radix(str, n, dest, 8); \ 950 } \ 951 bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ 952 return parse_##name##_radix(str, n, dest, 0); \ 953 } 954 955 DEFINE_INTEGER_PARSERS(short); 956 DEFINE_INTEGER_PARSERS(ushort); 957 DEFINE_INTEGER_PARSERS(int); 958 DEFINE_INTEGER_PARSERS(uint); 959 DEFINE_INTEGER_PARSERS(long); 960 DEFINE_INTEGER_PARSERS(ulong); 961 DEFINE_INTEGER_PARSERS(longlong); 962 DEFINE_INTEGER_PARSERS(ulonglong); 963 964 #undef DEFINE_INTEGER_PARSERS 965 966 } // namespace re2 967