1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Regular expression interface RE2. 6 // 7 // Originally the PCRE C++ wrapper, but adapted to use 8 // the new automata-based regular expression engines. 9 10 #include "re2/re2.h" 11 12 #include <stdio.h> 13 #include <ctype.h> 14 #include <string> 15 #include <pthread.h> 16 #include <errno.h> 17 #include "util/util.h" 18 #include "util/flags.h" 19 #include "re2/prog.h" 20 #include "re2/regexp.h" 21 22 DEFINE_bool(trace_re2, false, "trace RE2 execution"); 23 24 namespace re2 { 25 26 // Maximum number of args we can set 27 static const int kMaxArgs = 16; 28 static const int kVecSize = 1+kMaxArgs; 29 30 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch; 31 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch; 32 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume; 33 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume; 34 35 const int RE2::Options::kDefaultMaxMem; // initialized in re2.h 36 37 // Commonly-used option sets; arguments to constructor are: 38 // utf8 input 39 // posix syntax 40 // longest match 41 // log errors 42 const RE2::Options RE2::DefaultOptions; // EncodingUTF8, false, false, true 43 const RE2::Options RE2::Latin1(RE2::Options::EncodingLatin1, false, false, true); 44 const RE2::Options RE2::POSIX(RE2::Options::EncodingUTF8, true, true, true); 45 const RE2::Options RE2::Quiet(RE2::Options::EncodingUTF8, false, false, false); 46 47 // If a regular expression has no error, its error_ field points here 48 static const string empty_string; 49 50 // Converts from Regexp error code to RE2 error code. 51 // Maybe some day they will diverge. In any event, this 52 // hides the existence of Regexp from RE2 users. 53 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { 54 switch (code) { 55 case re2::kRegexpSuccess: 56 return RE2::NoError; 57 case re2::kRegexpInternalError: 58 return RE2::ErrorInternal; 59 case re2::kRegexpBadEscape: 60 return RE2::ErrorBadEscape; 61 case re2::kRegexpBadCharClass: 62 return RE2::ErrorBadCharClass; 63 case re2::kRegexpBadCharRange: 64 return RE2::ErrorBadCharRange; 65 case re2::kRegexpMissingBracket: 66 return RE2::ErrorMissingBracket; 67 case re2::kRegexpMissingParen: 68 return RE2::ErrorMissingParen; 69 case re2::kRegexpTrailingBackslash: 70 return RE2::ErrorTrailingBackslash; 71 case re2::kRegexpRepeatArgument: 72 return RE2::ErrorRepeatArgument; 73 case re2::kRegexpRepeatSize: 74 return RE2::ErrorRepeatSize; 75 case re2::kRegexpRepeatOp: 76 return RE2::ErrorRepeatOp; 77 case re2::kRegexpBadPerlOp: 78 return RE2::ErrorBadPerlOp; 79 case re2::kRegexpBadUTF8: 80 return RE2::ErrorBadUTF8; 81 case re2::kRegexpBadNamedCapture: 82 return RE2::ErrorBadNamedCapture; 83 } 84 return RE2::ErrorInternal; 85 } 86 87 static string trunc(const StringPiece& pattern) { 88 if (pattern.size() < 100) 89 return pattern.as_string(); 90 return pattern.substr(0, 100).as_string() + "..."; 91 } 92 93 94 RE2::RE2(const char* pattern) { 95 Init(pattern, DefaultOptions); 96 } 97 98 RE2::RE2(const string& pattern) { 99 Init(pattern, DefaultOptions); 100 } 101 102 RE2::RE2(const StringPiece& pattern) { 103 Init(pattern, DefaultOptions); 104 } 105 106 RE2::RE2(const StringPiece& pattern, const Options& options) { 107 Init(pattern, options); 108 } 109 110 int RE2::Options::ParseFlags() const { 111 int flags = Regexp::ClassNL; 112 switch (encoding()) { 113 default: 114 LOG(ERROR) << "Unknown encoding " << encoding(); 115 break; 116 case RE2::Options::EncodingUTF8: 117 break; 118 case RE2::Options::EncodingLatin1: 119 flags |= Regexp::Latin1; 120 break; 121 } 122 123 if (!posix_syntax()) 124 flags |= Regexp::LikePerl; 125 126 if (literal()) 127 flags |= Regexp::Literal; 128 129 if (never_nl()) 130 flags |= Regexp::NeverNL; 131 132 if (!case_sensitive()) 133 flags |= Regexp::FoldCase; 134 135 if (perl_classes()) 136 flags |= Regexp::PerlClasses; 137 138 if (word_boundary()) 139 flags |= Regexp::PerlB; 140 141 if (one_line()) 142 flags |= Regexp::OneLine; 143 144 return flags; 145 } 146 147 void RE2::Init(const StringPiece& pattern, const Options& options) { 148 mutex_ = new Mutex; 149 pattern_ = pattern.as_string(); 150 options_.Copy(options); 151 error_ = &empty_string; 152 error_code_ = NoError; 153 suffix_regexp_ = NULL; 154 entire_regexp_ = NULL; 155 prog_ = NULL; 156 rprog_ = NULL; 157 named_groups_ = NULL; 158 group_names_ = NULL; 159 num_captures_ = -1; 160 161 RegexpStatus status; 162 entire_regexp_ = Regexp::Parse( 163 pattern_, 164 static_cast<Regexp::ParseFlags>(options_.ParseFlags()), 165 &status); 166 if (entire_regexp_ == NULL) { 167 if (error_ == &empty_string) 168 error_ = new string(status.Text()); 169 if (options_.log_errors()) { 170 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " 171 << status.Text(); 172 } 173 error_arg_ = status.error_arg().as_string(); 174 error_code_ = RegexpErrorToRE2(status.code()); 175 return; 176 } 177 178 prefix_.clear(); 179 prefix_foldcase_ = false; 180 re2::Regexp* suffix; 181 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) 182 suffix_regexp_ = suffix; 183 else 184 suffix_regexp_ = entire_regexp_->Incref(); 185 186 // Two thirds of the memory goes to the forward Prog, 187 // one third to the reverse prog, because the forward 188 // Prog has two DFAs but the reverse prog has one. 189 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); 190 if (prog_ == NULL) { 191 if (options_.log_errors()) 192 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'"; 193 error_ = new string("pattern too large - compile failed"); 194 error_code_ = RE2::ErrorPatternTooLarge; 195 return; 196 } 197 198 // Could delay this until the first match call that 199 // cares about submatch information, but the one-pass 200 // machine's memory gets cut from the DFA memory budget, 201 // and that is harder to do if the DFA has already 202 // been built. 203 is_one_pass_ = prog_->IsOnePass(); 204 } 205 206 // Returns rprog_, computing it if needed. 207 re2::Prog* RE2::ReverseProg() const { 208 MutexLock l(mutex_); 209 if (rprog_ == NULL && error_ == &empty_string) { 210 rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3); 211 if (rprog_ == NULL) { 212 if (options_.log_errors()) 213 LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'"; 214 error_ = new string("pattern too large - reverse compile failed"); 215 error_code_ = RE2::ErrorPatternTooLarge; 216 return NULL; 217 } 218 } 219 return rprog_; 220 } 221 222 static const map<string, int> empty_named_groups; 223 static const map<int, string> empty_group_names; 224 225 RE2::~RE2() { 226 if (suffix_regexp_) 227 suffix_regexp_->Decref(); 228 if (entire_regexp_) 229 entire_regexp_->Decref(); 230 delete mutex_; 231 delete prog_; 232 delete rprog_; 233 if (error_ != &empty_string) 234 delete error_; 235 if (named_groups_ != NULL && named_groups_ != &empty_named_groups) 236 delete named_groups_; 237 if (group_names_ != NULL && group_names_ != &empty_group_names) 238 delete group_names_; 239 } 240 241 int RE2::ProgramSize() const { 242 if (prog_ == NULL) 243 return -1; 244 return prog_->size(); 245 } 246 247 // Returns named_groups_, computing it if needed. 248 const map<string, int>& RE2::NamedCapturingGroups() const { 249 MutexLock l(mutex_); 250 if (!ok()) 251 return empty_named_groups; 252 if (named_groups_ == NULL) { 253 named_groups_ = suffix_regexp_->NamedCaptures(); 254 if (named_groups_ == NULL) 255 named_groups_ = &empty_named_groups; 256 } 257 return *named_groups_; 258 } 259 260 // Returns group_names_, computing it if needed. 261 const map<int, string>& RE2::CapturingGroupNames() const { 262 MutexLock l(mutex_); 263 if (!ok()) 264 return empty_group_names; 265 if (group_names_ == NULL) { 266 group_names_ = suffix_regexp_->CaptureNames(); 267 if (group_names_ == NULL) 268 group_names_ = &empty_group_names; 269 } 270 return *group_names_; 271 } 272 273 /***** Convenience interfaces *****/ 274 275 bool RE2::FullMatchN(const StringPiece& text, const RE2& re, 276 const Arg* const args[], int n) { 277 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); 278 } 279 280 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, 281 const Arg* const args[], int n) { 282 return re.DoMatch(text, UNANCHORED, NULL, args, n); 283 } 284 285 bool RE2::ConsumeN(StringPiece* input, const RE2& re, 286 const Arg* const args[], int n) { 287 int consumed; 288 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { 289 input->remove_prefix(consumed); 290 return true; 291 } else { 292 return false; 293 } 294 } 295 296 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, 297 const Arg* const args[], int n) { 298 int consumed; 299 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { 300 input->remove_prefix(consumed); 301 return true; 302 } else { 303 return false; 304 } 305 } 306 307 // Returns the maximum submatch needed for the rewrite to be done by Replace(). 308 // E.g. if rewrite == "foo \\2,\\1", returns 2. 309 static int MaxSubmatch(const StringPiece& rewrite) { 310 int max = 0; 311 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 312 s < end; s++) { 313 if (*s == '\\') { 314 s++; 315 int c = (s < end) ? *s : -1; 316 if (isdigit(c)) { 317 int n = (c - '0'); 318 if (n > max) 319 max = n; 320 } 321 } 322 } 323 return max; 324 } 325 326 bool RE2::Replace(string *str, 327 const RE2& re, 328 const StringPiece& rewrite) { 329 StringPiece vec[kVecSize]; 330 int nvec = 1 + MaxSubmatch(rewrite); 331 if (nvec > arraysize(vec)) 332 return false; 333 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) 334 return false; 335 336 string s; 337 if (!re.Rewrite(&s, rewrite, vec, nvec)) 338 return false; 339 340 assert(vec[0].begin() >= str->data()); 341 assert(vec[0].end() <= str->data()+str->size()); 342 str->replace(vec[0].data() - str->data(), vec[0].size(), s); 343 return true; 344 } 345 346 int RE2::GlobalReplace(string *str, 347 const RE2& re, 348 const StringPiece& rewrite) { 349 StringPiece vec[kVecSize]; 350 int nvec = 1 + MaxSubmatch(rewrite); 351 if (nvec > arraysize(vec)) 352 return false; 353 354 const char* p = str->data(); 355 const char* ep = p + str->size(); 356 const char* lastend = NULL; 357 string out; 358 int count = 0; 359 while (p <= ep) { 360 if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec)) 361 break; 362 if (p < vec[0].begin()) 363 out.append(p, vec[0].begin() - p); 364 if (vec[0].begin() == lastend && vec[0].size() == 0) { 365 // Disallow empty match at end of last match: skip ahead. 366 if (p < ep) 367 out.append(p, 1); 368 p++; 369 continue; 370 } 371 re.Rewrite(&out, rewrite, vec, nvec); 372 p = vec[0].end(); 373 lastend = p; 374 count++; 375 } 376 377 if (count == 0) 378 return 0; 379 380 if (p < ep) 381 out.append(p, ep - p); 382 swap(out, *str); 383 return count; 384 } 385 386 bool RE2::Extract(const StringPiece &text, 387 const RE2& re, 388 const StringPiece &rewrite, 389 string *out) { 390 StringPiece vec[kVecSize]; 391 int nvec = 1 + MaxSubmatch(rewrite); 392 if (nvec > arraysize(vec)) 393 return false; 394 395 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) 396 return false; 397 398 out->clear(); 399 return re.Rewrite(out, rewrite, vec, nvec); 400 } 401 402 string RE2::QuoteMeta(const StringPiece& unquoted) { 403 string result; 404 result.reserve(unquoted.size() << 1); 405 406 // Escape any ascii character not in [A-Za-z_0-9]. 407 // 408 // Note that it's legal to escape a character even if it has no 409 // special meaning in a regular expression -- so this function does 410 // that. (This also makes it identical to the perl function of the 411 // same name except for the null-character special case; 412 // see `perldoc -f quotemeta`.) 413 for (int ii = 0; ii < unquoted.length(); ++ii) { 414 // Note that using 'isalnum' here raises the benchmark time from 415 // 32ns to 58ns: 416 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && 417 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && 418 (unquoted[ii] < '0' || unquoted[ii] > '9') && 419 unquoted[ii] != '_' && 420 // If this is the part of a UTF8 or Latin1 character, we need 421 // to copy this byte without escaping. Experimentally this is 422 // what works correctly with the regexp library. 423 !(unquoted[ii] & 128)) { 424 if (unquoted[ii] == '\0') { // Special handling for null chars. 425 // Note that this special handling is not strictly required for RE2, 426 // but this quoting is required for other regexp libraries such as 427 // PCRE. 428 // Can't use "\\0" since the next character might be a digit. 429 result += "\\x00"; 430 continue; 431 } 432 result += '\\'; 433 } 434 result += unquoted[ii]; 435 } 436 437 return result; 438 } 439 440 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { 441 if (prog_ == NULL) 442 return false; 443 444 int n = prefix_.size(); 445 if (n > maxlen) 446 n = maxlen; 447 448 // Determine initial min max from prefix_ literal. 449 string pmin, pmax; 450 pmin = prefix_.substr(0, n); 451 pmax = prefix_.substr(0, n); 452 if (prefix_foldcase_) { 453 // prefix is ASCII lowercase; change pmin to uppercase. 454 for (int i = 0; i < n; i++) { 455 if ('a' <= pmin[i] && pmin[i] <= 'z') 456 pmin[i] += 'A' - 'a'; 457 } 458 } 459 460 // Add to prefix min max using PossibleMatchRange on regexp. 461 string dmin, dmax; 462 maxlen -= n; 463 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { 464 pmin += dmin; 465 pmax += dmax; 466 } else if (pmax.size() > 0) { 467 // prog_->PossibleMatchRange has failed us, 468 // but we still have useful information from prefix_. 469 // Round up pmax to allow any possible suffix. 470 pmax = PrefixSuccessor(pmax); 471 } else { 472 // Nothing useful. 473 *min = ""; 474 *max = ""; 475 return false; 476 } 477 478 *min = pmin; 479 *max = pmax; 480 return true; 481 } 482 483 // Avoid possible locale nonsense in standard strcasecmp. 484 // The string a is known to be all lowercase. 485 static int ascii_strcasecmp(const char* a, const char* b, int len) { 486 const char *ae = a + len; 487 488 for (; a < ae; a++, b++) { 489 uint8 x = *a; 490 uint8 y = *b; 491 if ('A' <= y && y <= 'Z') 492 y += 'a' - 'A'; 493 if (x != y) 494 return x - y; 495 } 496 return 0; 497 } 498 499 500 /***** Actual matching and rewriting code *****/ 501 502 bool RE2::Match(const StringPiece& text, 503 int startpos, 504 int endpos, 505 Anchor re_anchor, 506 StringPiece* submatch, 507 int nsubmatch) const { 508 if (!ok() || suffix_regexp_ == NULL) { 509 if (options_.log_errors()) 510 LOG(ERROR) << "Invalid RE2: " << *error_; 511 return false; 512 } 513 514 if (startpos < 0 || startpos > endpos || endpos > text.size()) { 515 LOG(ERROR) << "RE2: invalid startpos, endpos pair."; 516 return false; 517 } 518 519 StringPiece subtext = text; 520 subtext.remove_prefix(startpos); 521 subtext.remove_suffix(text.size() - endpos); 522 523 // Use DFAs to find exact location of match, filter out non-matches. 524 525 // Don't ask for the location if we won't use it. 526 // SearchDFA can do extra optimizations in that case. 527 StringPiece match; 528 StringPiece* matchp = &match; 529 if (nsubmatch == 0) 530 matchp = NULL; 531 532 int ncap = 1 + NumberOfCapturingGroups(); 533 if (ncap > nsubmatch) 534 ncap = nsubmatch; 535 536 // If the regexp is anchored explicitly, must not be in middle of text. 537 if (prog_->anchor_start() && startpos != 0) 538 return false; 539 540 // If the regexp is anchored explicitly, update re_anchor 541 // so that we can potentially fall into a faster case below. 542 if (prog_->anchor_start() && prog_->anchor_end()) 543 re_anchor = ANCHOR_BOTH; 544 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) 545 re_anchor = ANCHOR_START; 546 547 // Check for the required prefix, if any. 548 int prefixlen = 0; 549 if (!prefix_.empty()) { 550 if (startpos != 0) 551 return false; 552 prefixlen = prefix_.size(); 553 if (prefixlen > subtext.size()) 554 return false; 555 if (prefix_foldcase_) { 556 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) 557 return false; 558 } else { 559 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) 560 return false; 561 } 562 subtext.remove_prefix(prefixlen); 563 // If there is a required prefix, the anchor must be at least ANCHOR_START. 564 if (re_anchor != ANCHOR_BOTH) 565 re_anchor = ANCHOR_START; 566 } 567 568 Prog::Anchor anchor = Prog::kUnanchored; 569 Prog::MatchKind kind = Prog::kFirstMatch; 570 if (options_.longest_match()) 571 kind = Prog::kLongestMatch; 572 bool skipped_test = false; 573 574 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture); 575 576 // SearchBitState allocates a bit vector of size prog_->size() * text.size(). 577 // It also allocates a stack of 3-word structures which could potentially 578 // grow as large as prog_->size() * text.size() but in practice is much 579 // smaller. 580 // Conditions for using SearchBitState: 581 const int MaxBitStateProg = 500; // prog_->size() <= Max. 582 const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits) 583 bool can_bit_state = prog_->size() <= MaxBitStateProg; 584 int bit_state_text_max = MaxBitStateVector / prog_->size(); 585 586 bool dfa_failed = false; 587 switch (re_anchor) { 588 default: 589 case UNANCHORED: { 590 if (!prog_->SearchDFA(subtext, text, anchor, kind, 591 matchp, &dfa_failed, NULL)) { 592 if (dfa_failed) { 593 // Fall back to NFA below. 594 skipped_test = true; 595 if (FLAGS_trace_re2) 596 LOG(INFO) << "Match " << trunc(pattern_) 597 << " [" << CEscape(subtext) << "]" 598 << " DFA failed."; 599 break; 600 } 601 if (FLAGS_trace_re2) 602 LOG(INFO) << "Match " << trunc(pattern_) 603 << " [" << CEscape(subtext) << "]" 604 << " used DFA - no match."; 605 return false; 606 } 607 if (FLAGS_trace_re2) 608 LOG(INFO) << "Match " << trunc(pattern_) 609 << " [" << CEscape(subtext) << "]" 610 << " used DFA - match"; 611 if (matchp == NULL) // Matched. Don't care where 612 return true; 613 // SearchDFA set match[0].end() but didn't know where the 614 // match started. Run the regexp backward from match[0].end() 615 // to find the longest possible match -- that's where it started. 616 Prog* prog = ReverseProg(); 617 if (prog == NULL) 618 return false; 619 if (!prog->SearchDFA(match, text, Prog::kAnchored, 620 Prog::kLongestMatch, &match, &dfa_failed, NULL)) { 621 if (dfa_failed) { 622 // Fall back to NFA below. 623 skipped_test = true; 624 if (FLAGS_trace_re2) 625 LOG(INFO) << "Match " << trunc(pattern_) 626 << " [" << CEscape(subtext) << "]" 627 << " reverse DFA failed."; 628 break; 629 } 630 if (FLAGS_trace_re2) 631 LOG(INFO) << "Match " << trunc(pattern_) 632 << " [" << CEscape(subtext) << "]" 633 << " DFA inconsistency."; 634 LOG(ERROR) << "DFA inconsistency"; 635 return false; 636 } 637 if (FLAGS_trace_re2) 638 LOG(INFO) << "Match " << trunc(pattern_) 639 << " [" << CEscape(subtext) << "]" 640 << " used reverse DFA."; 641 break; 642 } 643 644 case ANCHOR_BOTH: 645 case ANCHOR_START: 646 if (re_anchor == ANCHOR_BOTH) 647 kind = Prog::kFullMatch; 648 anchor = Prog::kAnchored; 649 650 // If only a small amount of text and need submatch 651 // information anyway and we're going to use OnePass or BitState 652 // to get it, we might as well not even bother with the DFA: 653 // OnePass or BitState will be fast enough. 654 // On tiny texts, OnePass outruns even the DFA, and 655 // it doesn't have the shared state and occasional mutex that 656 // the DFA does. 657 if (can_one_pass && text.size() <= 4096 && 658 (ncap > 1 || text.size() <= 8)) { 659 if (FLAGS_trace_re2) 660 LOG(INFO) << "Match " << trunc(pattern_) 661 << " [" << CEscape(subtext) << "]" 662 << " skipping DFA for OnePass."; 663 skipped_test = true; 664 break; 665 } 666 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) { 667 if (FLAGS_trace_re2) 668 LOG(INFO) << "Match " << trunc(pattern_) 669 << " [" << CEscape(subtext) << "]" 670 << " skipping DFA for BitState."; 671 skipped_test = true; 672 break; 673 } 674 if (!prog_->SearchDFA(subtext, text, anchor, kind, 675 &match, &dfa_failed, NULL)) { 676 if (dfa_failed) { 677 if (FLAGS_trace_re2) 678 LOG(INFO) << "Match " << trunc(pattern_) 679 << " [" << CEscape(subtext) << "]" 680 << " DFA failed."; 681 skipped_test = true; 682 break; 683 } 684 if (FLAGS_trace_re2) 685 LOG(INFO) << "Match " << trunc(pattern_) 686 << " [" << CEscape(subtext) << "]" 687 << " used DFA - no match."; 688 return false; 689 } 690 break; 691 } 692 693 if (!skipped_test && ncap <= 1) { 694 // We know exactly where it matches. That's enough. 695 if (ncap == 1) 696 submatch[0] = match; 697 } else { 698 StringPiece subtext1; 699 if (skipped_test) { 700 // DFA ran out of memory or was skipped: 701 // need to search in entire original text. 702 subtext1 = subtext; 703 } else { 704 // DFA found the exact match location: 705 // let NFA run an anchored, full match search 706 // to find submatch locations. 707 subtext1 = match; 708 anchor = Prog::kAnchored; 709 kind = Prog::kFullMatch; 710 } 711 712 if (can_one_pass && anchor != Prog::kUnanchored) { 713 if (FLAGS_trace_re2) 714 LOG(INFO) << "Match " << trunc(pattern_) 715 << " [" << CEscape(subtext) << "]" 716 << " using OnePass."; 717 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { 718 if (!skipped_test) 719 LOG(ERROR) << "SearchOnePass inconsistency"; 720 return false; 721 } 722 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) { 723 if (FLAGS_trace_re2) 724 LOG(INFO) << "Match " << trunc(pattern_) 725 << " [" << CEscape(subtext) << "]" 726 << " using BitState."; 727 if (!prog_->SearchBitState(subtext1, text, anchor, 728 kind, submatch, ncap)) { 729 if (!skipped_test) 730 LOG(ERROR) << "SearchBitState inconsistency"; 731 return false; 732 } 733 } else { 734 if (FLAGS_trace_re2) 735 LOG(INFO) << "Match " << trunc(pattern_) 736 << " [" << CEscape(subtext) << "]" 737 << " using NFA."; 738 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { 739 if (!skipped_test) 740 LOG(ERROR) << "SearchNFA inconsistency"; 741 return false; 742 } 743 } 744 } 745 746 // Adjust overall match for required prefix that we stripped off. 747 if (prefixlen > 0 && nsubmatch > 0) 748 submatch[0] = StringPiece(submatch[0].begin() - prefixlen, 749 submatch[0].size() + prefixlen); 750 751 // Zero submatches that don't exist in the regexp. 752 for (int i = ncap; i < nsubmatch; i++) 753 submatch[i] = NULL; 754 return true; 755 } 756 757 // Internal matcher - like Match() but takes Args not StringPieces. 758 bool RE2::DoMatch(const StringPiece& text, 759 Anchor anchor, 760 int* consumed, 761 const Arg* const* args, 762 int n) const { 763 if (!ok()) { 764 if (options_.log_errors()) 765 LOG(ERROR) << "Invalid RE2: " << *error_; 766 return false; 767 } 768 769 // Count number of capture groups needed. 770 int nvec; 771 if (n == 0 && consumed == NULL) 772 nvec = 0; 773 else 774 nvec = n+1; 775 776 StringPiece* vec; 777 StringPiece stkvec[kVecSize]; 778 StringPiece* heapvec = NULL; 779 780 if (nvec <= arraysize(stkvec)) { 781 vec = stkvec; 782 } else { 783 vec = new StringPiece[nvec]; 784 heapvec = vec; 785 } 786 787 if (!Match(text, 0, text.size(), anchor, vec, nvec)) { 788 delete[] heapvec; 789 return false; 790 } 791 792 if(consumed != NULL) 793 *consumed = vec[0].end() - text.begin(); 794 795 if (n == 0 || args == NULL) { 796 // We are not interested in results 797 delete[] heapvec; 798 return true; 799 } 800 801 int ncap = NumberOfCapturingGroups(); 802 if (ncap < n) { 803 // RE has fewer capturing groups than number of arg pointers passed in 804 VLOG(1) << "Asked for " << n << " but only have " << ncap; 805 delete[] heapvec; 806 return false; 807 } 808 809 // If we got here, we must have matched the whole pattern. 810 for (int i = 0; i < n; i++) { 811 const StringPiece& s = vec[i+1]; 812 if (!args[i]->Parse(s.data(), s.size())) { 813 // TODO: Should we indicate what the error was? 814 VLOG(1) << "Parse error on #" << i << " " << s << " " 815 << (void*)s.data() << "/" << s.size(); 816 delete[] heapvec; 817 return false; 818 } 819 } 820 821 delete[] heapvec; 822 return true; 823 } 824 825 // Append the "rewrite" string, with backslash subsitutions from "vec", 826 // to string "out". 827 bool RE2::Rewrite(string *out, const StringPiece &rewrite, 828 const StringPiece *vec, int veclen) const { 829 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 830 s < end; s++) { 831 int c = *s; 832 if (c == '\\') { 833 s++; 834 c = (s < end) ? *s : -1; 835 if (isdigit(c)) { 836 int n = (c - '0'); 837 if (n >= veclen) { 838 LOG(ERROR) << "requested group " << n 839 << " in regexp " << rewrite.data(); 840 return false; 841 } 842 StringPiece snip = vec[n]; 843 if (snip.size() > 0) 844 out->append(snip.data(), snip.size()); 845 } else if (c == '\\') { 846 out->push_back('\\'); 847 } else { 848 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); 849 return false; 850 } 851 } else { 852 out->push_back(c); 853 } 854 } 855 return true; 856 } 857 858 // Return the number of capturing subpatterns, or -1 if the 859 // regexp wasn't valid on construction. 860 int RE2::NumberOfCapturingGroups() const { 861 if (suffix_regexp_ == NULL) 862 return -1; 863 ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case" 864 " multiple threads end up doing the same work in parallel."); 865 if (num_captures_ == -1) 866 num_captures_ = suffix_regexp_->NumCaptures(); 867 return num_captures_; 868 } 869 870 // Checks that the rewrite string is well-formed with respect to this 871 // regular expression. 872 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { 873 int max_token = -1; 874 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 875 s < end; s++) { 876 int c = *s; 877 if (c != '\\') { 878 continue; 879 } 880 if (++s == end) { 881 *error = "Rewrite schema error: '\\' not allowed at end."; 882 return false; 883 } 884 c = *s; 885 if (c == '\\') { 886 continue; 887 } 888 if (!isdigit(c)) { 889 *error = "Rewrite schema error: " 890 "'\\' must be followed by a digit or '\\'."; 891 return false; 892 } 893 int n = (c - '0'); 894 if (max_token < n) { 895 max_token = n; 896 } 897 } 898 899 if (max_token > NumberOfCapturingGroups()) { 900 SStringPrintf(error, "Rewrite schema requests %d matches, " 901 "but the regexp only has %d parenthesized subexpressions.", 902 max_token, NumberOfCapturingGroups()); 903 return false; 904 } 905 return true; 906 } 907 908 /***** Parsers for various types *****/ 909 910 bool RE2::Arg::parse_null(const char* str, int n, void* dest) { 911 // We fail if somebody asked us to store into a non-NULL void* pointer 912 return (dest == NULL); 913 } 914 915 bool RE2::Arg::parse_string(const char* str, int n, void* dest) { 916 if (dest == NULL) return true; 917 reinterpret_cast<string*>(dest)->assign(str, n); 918 return true; 919 } 920 921 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) { 922 if (dest == NULL) return true; 923 reinterpret_cast<StringPiece*>(dest)->set(str, n); 924 return true; 925 } 926 927 bool RE2::Arg::parse_char(const char* str, int n, void* dest) { 928 if (n != 1) return false; 929 if (dest == NULL) return true; 930 *(reinterpret_cast<char*>(dest)) = str[0]; 931 return true; 932 } 933 934 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { 935 if (n != 1) return false; 936 if (dest == NULL) return true; 937 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; 938 return true; 939 } 940 941 // Largest number spec that we are willing to parse 942 static const int kMaxNumberLength = 32; 943 944 // REQUIRES "buf" must have length at least kMaxNumberLength+1 945 // Copies "str" into "buf" and null-terminates. 946 // Overwrites *np with the new length. 947 static const char* TerminateNumber(char* buf, const char* str, int* np) { 948 int n = *np; 949 if (n <= 0) return ""; 950 if (n > 0 && isspace(*str)) { 951 // We are less forgiving than the strtoxxx() routines and do not 952 // allow leading spaces. 953 return ""; 954 } 955 956 // Although buf has a fixed maximum size, we can still handle 957 // arbitrarily large integers correctly by omitting leading zeros. 958 // (Numbers that are still too long will be out of range.) 959 // Before deciding whether str is too long, 960 // remove leading zeros with s/000+/00/. 961 // Leaving the leading two zeros in place means that 962 // we don't change 0000x123 (invalid) into 0x123 (valid). 963 // Skip over leading - before replacing. 964 bool neg = false; 965 if (n >= 1 && str[0] == '-') { 966 neg = true; 967 n--; 968 str++; 969 } 970 971 if (n >= 3 && str[0] == '0' && str[1] == '0') { 972 while (n >= 3 && str[2] == '0') { 973 n--; 974 str++; 975 } 976 } 977 978 if (neg) { // make room in buf for - 979 n++; 980 str--; 981 } 982 983 if (n > kMaxNumberLength) return ""; 984 985 memmove(buf, str, n); 986 if (neg) { 987 buf[0] = '-'; 988 } 989 buf[n] = '\0'; 990 *np = n; 991 return buf; 992 } 993 994 bool RE2::Arg::parse_long_radix(const char* str, 995 int n, 996 void* dest, 997 int radix) { 998 if (n == 0) return false; 999 char buf[kMaxNumberLength+1]; 1000 str = TerminateNumber(buf, str, &n); 1001 char* end; 1002 errno = 0; 1003 long r = strtol(str, &end, radix); 1004 if (end != str + n) return false; // Leftover junk 1005 if (errno) return false; 1006 if (dest == NULL) return true; 1007 *(reinterpret_cast<long*>(dest)) = r; 1008 return true; 1009 } 1010 1011 bool RE2::Arg::parse_ulong_radix(const char* str, 1012 int n, 1013 void* dest, 1014 int radix) { 1015 if (n == 0) return false; 1016 char buf[kMaxNumberLength+1]; 1017 str = TerminateNumber(buf, str, &n); 1018 if (str[0] == '-') { 1019 // strtoul() will silently accept negative numbers and parse 1020 // them. This module is more strict and treats them as errors. 1021 return false; 1022 } 1023 1024 char* end; 1025 errno = 0; 1026 unsigned long r = strtoul(str, &end, radix); 1027 if (end != str + n) return false; // Leftover junk 1028 if (errno) return false; 1029 if (dest == NULL) return true; 1030 *(reinterpret_cast<unsigned long*>(dest)) = r; 1031 return true; 1032 } 1033 1034 bool RE2::Arg::parse_short_radix(const char* str, 1035 int n, 1036 void* dest, 1037 int radix) { 1038 long r; 1039 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse 1040 if ((short)r != r) return false; // Out of range 1041 if (dest == NULL) return true; 1042 *(reinterpret_cast<short*>(dest)) = r; 1043 return true; 1044 } 1045 1046 bool RE2::Arg::parse_ushort_radix(const char* str, 1047 int n, 1048 void* dest, 1049 int radix) { 1050 unsigned long r; 1051 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse 1052 if ((ushort)r != r) return false; // Out of range 1053 if (dest == NULL) return true; 1054 *(reinterpret_cast<unsigned short*>(dest)) = r; 1055 return true; 1056 } 1057 1058 bool RE2::Arg::parse_int_radix(const char* str, 1059 int n, 1060 void* dest, 1061 int radix) { 1062 long r; 1063 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse 1064 if ((int)r != r) return false; // Out of range 1065 if (dest == NULL) return true; 1066 *(reinterpret_cast<int*>(dest)) = r; 1067 return true; 1068 } 1069 1070 bool RE2::Arg::parse_uint_radix(const char* str, 1071 int n, 1072 void* dest, 1073 int radix) { 1074 unsigned long r; 1075 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse 1076 if ((uint)r != r) return false; // Out of range 1077 if (dest == NULL) return true; 1078 *(reinterpret_cast<unsigned int*>(dest)) = r; 1079 return true; 1080 } 1081 1082 bool RE2::Arg::parse_longlong_radix(const char* str, 1083 int n, 1084 void* dest, 1085 int radix) { 1086 if (n == 0) return false; 1087 char buf[kMaxNumberLength+1]; 1088 str = TerminateNumber(buf, str, &n); 1089 char* end; 1090 errno = 0; 1091 int64 r = strtoll(str, &end, radix); 1092 if (end != str + n) return false; // Leftover junk 1093 if (errno) return false; 1094 if (dest == NULL) return true; 1095 *(reinterpret_cast<int64*>(dest)) = r; 1096 return true; 1097 } 1098 1099 bool RE2::Arg::parse_ulonglong_radix(const char* str, 1100 int n, 1101 void* dest, 1102 int radix) { 1103 if (n == 0) return false; 1104 char buf[kMaxNumberLength+1]; 1105 str = TerminateNumber(buf, str, &n); 1106 if (str[0] == '-') { 1107 // strtoull() will silently accept negative numbers and parse 1108 // them. This module is more strict and treats them as errors. 1109 return false; 1110 } 1111 char* end; 1112 errno = 0; 1113 uint64 r = strtoull(str, &end, radix); 1114 if (end != str + n) return false; // Leftover junk 1115 if (errno) return false; 1116 if (dest == NULL) return true; 1117 *(reinterpret_cast<uint64*>(dest)) = r; 1118 return true; 1119 } 1120 1121 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) { 1122 if (n == 0) return false; 1123 static const int kMaxLength = 200; 1124 char buf[kMaxLength]; 1125 if (n >= kMaxLength) return false; 1126 memcpy(buf, str, n); 1127 buf[n] = '\0'; 1128 errno = 0; 1129 char* end; 1130 double r; 1131 if (isfloat) { 1132 r = strtof(buf, &end); 1133 } else { 1134 r = strtod(buf, &end); 1135 } 1136 if (end != buf + n) return false; // Leftover junk 1137 if (errno) return false; 1138 if (dest == NULL) return true; 1139 if (isfloat) { 1140 *(reinterpret_cast<float*>(dest)) = r; 1141 } else { 1142 *(reinterpret_cast<double*>(dest)) = r; 1143 } 1144 return true; 1145 } 1146 1147 bool RE2::Arg::parse_double(const char* str, int n, void* dest) { 1148 return parse_double_float(str, n, false, dest); 1149 } 1150 1151 bool RE2::Arg::parse_float(const char* str, int n, void* dest) { 1152 return parse_double_float(str, n, true, dest); 1153 } 1154 1155 1156 #define DEFINE_INTEGER_PARSERS(name) \ 1157 bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \ 1158 return parse_##name##_radix(str, n, dest, 10); \ 1159 } \ 1160 bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ 1161 return parse_##name##_radix(str, n, dest, 16); \ 1162 } \ 1163 bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ 1164 return parse_##name##_radix(str, n, dest, 8); \ 1165 } \ 1166 bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ 1167 return parse_##name##_radix(str, n, dest, 0); \ 1168 } 1169 1170 DEFINE_INTEGER_PARSERS(short); 1171 DEFINE_INTEGER_PARSERS(ushort); 1172 DEFINE_INTEGER_PARSERS(int); 1173 DEFINE_INTEGER_PARSERS(uint); 1174 DEFINE_INTEGER_PARSERS(long); 1175 DEFINE_INTEGER_PARSERS(ulong); 1176 DEFINE_INTEGER_PARSERS(longlong); 1177 DEFINE_INTEGER_PARSERS(ulonglong); 1178 1179 #undef DEFINE_INTEGER_PARSERS 1180 1181 } // namespace re2 1182