1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Regular expression interface RE2. 6 // 7 // Originally the PCRE C++ wrapper, but adapted to use 8 // the new automata-based regular expression engines. 9 10 #include "re2/re2.h" 11 12 #include <stdio.h> 13 #include <string> 14 #include <pthread.h> 15 #include <errno.h> 16 #include "util/util.h" 17 #include "util/flags.h" 18 #include "re2/prog.h" 19 #include "re2/regexp.h" 20 21 DEFINE_bool(trace_re2, false, "trace RE2 execution"); 22 23 namespace re2 { 24 25 // Maximum number of args we can set 26 static const int kMaxArgs = 16; 27 static const int kVecSize = 1+kMaxArgs; 28 29 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch; 30 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch; 31 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume; 32 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume; 33 34 // This will trigger LNK2005 error in MSVC. 35 #ifndef COMPILER_MSVC 36 const int RE2::Options::kDefaultMaxMem; // initialized in re2.h 37 #endif // COMPILER_MSVC 38 39 RE2::Options::Options(RE2::CannedOptions opt) 40 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), 41 posix_syntax_(opt == RE2::POSIX), 42 longest_match_(opt == RE2::POSIX), 43 log_errors_(opt != RE2::Quiet), 44 max_mem_(kDefaultMaxMem), 45 literal_(false), 46 never_nl_(false), 47 never_capture_(false), 48 case_sensitive_(true), 49 perl_classes_(false), 50 word_boundary_(false), 51 one_line_(false) { 52 } 53 54 // static empty things for use as const references. 55 // To avoid global constructors, initialized on demand. 56 GLOBAL_MUTEX(empty_mutex); 57 static const string *empty_string; 58 static const map<string, int> *empty_named_groups; 59 static const map<int, string> *empty_group_names; 60 61 static void InitEmpty() { 62 GLOBAL_MUTEX_LOCK(empty_mutex); 63 if (empty_string == NULL) { 64 empty_string = new string; 65 empty_named_groups = new map<string, int>; 66 empty_group_names = new map<int, string>; 67 } 68 GLOBAL_MUTEX_UNLOCK(empty_mutex); 69 } 70 71 // Converts from Regexp error code to RE2 error code. 72 // Maybe some day they will diverge. In any event, this 73 // hides the existence of Regexp from RE2 users. 74 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { 75 switch (code) { 76 case re2::kRegexpSuccess: 77 return RE2::NoError; 78 case re2::kRegexpInternalError: 79 return RE2::ErrorInternal; 80 case re2::kRegexpBadEscape: 81 return RE2::ErrorBadEscape; 82 case re2::kRegexpBadCharClass: 83 return RE2::ErrorBadCharClass; 84 case re2::kRegexpBadCharRange: 85 return RE2::ErrorBadCharRange; 86 case re2::kRegexpMissingBracket: 87 return RE2::ErrorMissingBracket; 88 case re2::kRegexpMissingParen: 89 return RE2::ErrorMissingParen; 90 case re2::kRegexpTrailingBackslash: 91 return RE2::ErrorTrailingBackslash; 92 case re2::kRegexpRepeatArgument: 93 return RE2::ErrorRepeatArgument; 94 case re2::kRegexpRepeatSize: 95 return RE2::ErrorRepeatSize; 96 case re2::kRegexpRepeatOp: 97 return RE2::ErrorRepeatOp; 98 case re2::kRegexpBadPerlOp: 99 return RE2::ErrorBadPerlOp; 100 case re2::kRegexpBadUTF8: 101 return RE2::ErrorBadUTF8; 102 case re2::kRegexpBadNamedCapture: 103 return RE2::ErrorBadNamedCapture; 104 } 105 return RE2::ErrorInternal; 106 } 107 108 static string trunc(const StringPiece& pattern) { 109 if (pattern.size() < 100) 110 return pattern.as_string(); 111 return pattern.substr(0, 100).as_string() + "..."; 112 } 113 114 115 RE2::RE2(const char* pattern) { 116 Init(pattern, DefaultOptions); 117 } 118 119 RE2::RE2(const string& pattern) { 120 Init(pattern, DefaultOptions); 121 } 122 123 RE2::RE2(const StringPiece& pattern) { 124 Init(pattern, DefaultOptions); 125 } 126 127 RE2::RE2(const StringPiece& pattern, const Options& options) { 128 Init(pattern, options); 129 } 130 131 int RE2::Options::ParseFlags() const { 132 int flags = Regexp::ClassNL; 133 switch (encoding()) { 134 default: 135 if (log_errors()) 136 LOG(ERROR) << "Unknown encoding " << encoding(); 137 break; 138 case RE2::Options::EncodingUTF8: 139 break; 140 case RE2::Options::EncodingLatin1: 141 flags |= Regexp::Latin1; 142 break; 143 } 144 145 if (!posix_syntax()) 146 flags |= Regexp::LikePerl; 147 148 if (literal()) 149 flags |= Regexp::Literal; 150 151 if (never_nl()) 152 flags |= Regexp::NeverNL; 153 154 if (never_capture()) 155 flags |= Regexp::NeverCapture; 156 157 if (!case_sensitive()) 158 flags |= Regexp::FoldCase; 159 160 if (perl_classes()) 161 flags |= Regexp::PerlClasses; 162 163 if (word_boundary()) 164 flags |= Regexp::PerlB; 165 166 if (one_line()) 167 flags |= Regexp::OneLine; 168 169 return flags; 170 } 171 172 void RE2::Init(const StringPiece& pattern, const Options& options) { 173 mutex_ = new Mutex; 174 pattern_ = pattern.as_string(); 175 options_.Copy(options); 176 InitEmpty(); 177 error_ = empty_string; 178 error_code_ = NoError; 179 suffix_regexp_ = NULL; 180 entire_regexp_ = NULL; 181 prog_ = NULL; 182 rprog_ = NULL; 183 named_groups_ = NULL; 184 group_names_ = NULL; 185 num_captures_ = -1; 186 187 RegexpStatus status; 188 entire_regexp_ = Regexp::Parse( 189 pattern_, 190 static_cast<Regexp::ParseFlags>(options_.ParseFlags()), 191 &status); 192 if (entire_regexp_ == NULL) { 193 if (error_ == empty_string) 194 error_ = new string(status.Text()); 195 if (options_.log_errors()) { 196 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " 197 << status.Text(); 198 } 199 error_arg_ = status.error_arg().as_string(); 200 error_code_ = RegexpErrorToRE2(status.code()); 201 return; 202 } 203 204 prefix_.clear(); 205 prefix_foldcase_ = false; 206 re2::Regexp* suffix; 207 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) 208 suffix_regexp_ = suffix; 209 else 210 suffix_regexp_ = entire_regexp_->Incref(); 211 212 // Two thirds of the memory goes to the forward Prog, 213 // one third to the reverse prog, because the forward 214 // Prog has two DFAs but the reverse prog has one. 215 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); 216 if (prog_ == NULL) { 217 if (options_.log_errors()) 218 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'"; 219 error_ = new string("pattern too large - compile failed"); 220 error_code_ = RE2::ErrorPatternTooLarge; 221 return; 222 } 223 224 // Could delay this until the first match call that 225 // cares about submatch information, but the one-pass 226 // machine's memory gets cut from the DFA memory budget, 227 // and that is harder to do if the DFA has already 228 // been built. 229 is_one_pass_ = prog_->IsOnePass(); 230 } 231 232 // Returns rprog_, computing it if needed. 233 re2::Prog* RE2::ReverseProg() const { 234 MutexLock l(mutex_); 235 if (rprog_ == NULL && error_ == empty_string) { 236 rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3); 237 if (rprog_ == NULL) { 238 if (options_.log_errors()) 239 LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'"; 240 error_ = new string("pattern too large - reverse compile failed"); 241 error_code_ = RE2::ErrorPatternTooLarge; 242 return NULL; 243 } 244 } 245 return rprog_; 246 } 247 248 RE2::~RE2() { 249 if (suffix_regexp_) 250 suffix_regexp_->Decref(); 251 if (entire_regexp_) 252 entire_regexp_->Decref(); 253 delete mutex_; 254 delete prog_; 255 delete rprog_; 256 if (error_ != empty_string) 257 delete error_; 258 if (named_groups_ != NULL && named_groups_ != empty_named_groups) 259 delete named_groups_; 260 if (group_names_ != NULL && group_names_ != empty_group_names) 261 delete group_names_; 262 } 263 264 int RE2::ProgramSize() const { 265 if (prog_ == NULL) 266 return -1; 267 return prog_->size(); 268 } 269 270 // Returns named_groups_, computing it if needed. 271 const map<string, int>& RE2::NamedCapturingGroups() const { 272 MutexLock l(mutex_); 273 if (!ok()) 274 return *empty_named_groups; 275 if (named_groups_ == NULL) { 276 named_groups_ = suffix_regexp_->NamedCaptures(); 277 if (named_groups_ == NULL) 278 named_groups_ = empty_named_groups; 279 } 280 return *named_groups_; 281 } 282 283 // Returns group_names_, computing it if needed. 284 const map<int, string>& RE2::CapturingGroupNames() const { 285 MutexLock l(mutex_); 286 if (!ok()) 287 return *empty_group_names; 288 if (group_names_ == NULL) { 289 group_names_ = suffix_regexp_->CaptureNames(); 290 if (group_names_ == NULL) 291 group_names_ = empty_group_names; 292 } 293 return *group_names_; 294 } 295 296 /***** Convenience interfaces *****/ 297 298 bool RE2::FullMatchN(const StringPiece& text, const RE2& re, 299 const Arg* const args[], int n) { 300 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); 301 } 302 303 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, 304 const Arg* const args[], int n) { 305 return re.DoMatch(text, UNANCHORED, NULL, args, n); 306 } 307 308 bool RE2::ConsumeN(StringPiece* input, const RE2& re, 309 const Arg* const args[], int n) { 310 int consumed; 311 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { 312 input->remove_prefix(consumed); 313 return true; 314 } else { 315 return false; 316 } 317 } 318 319 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, 320 const Arg* const args[], int n) { 321 int consumed; 322 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { 323 input->remove_prefix(consumed); 324 return true; 325 } else { 326 return false; 327 } 328 } 329 330 // Returns the maximum submatch needed for the rewrite to be done by Replace(). 331 // E.g. if rewrite == "foo \\2,\\1", returns 2. 332 int RE2::MaxSubmatch(const StringPiece& rewrite) { 333 int max = 0; 334 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 335 s < end; s++) { 336 if (*s == '\\') { 337 s++; 338 int c = (s < end) ? *s : -1; 339 if (isdigit(c)) { 340 int n = (c - '0'); 341 if (n > max) 342 max = n; 343 } 344 } 345 } 346 return max; 347 } 348 349 bool RE2::Replace(string *str, 350 const RE2& re, 351 const StringPiece& rewrite) { 352 StringPiece vec[kVecSize]; 353 int nvec = 1 + MaxSubmatch(rewrite); 354 if (nvec > arraysize(vec)) 355 return false; 356 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) 357 return false; 358 359 string s; 360 if (!re.Rewrite(&s, rewrite, vec, nvec)) 361 return false; 362 363 assert(vec[0].begin() >= str->data()); 364 assert(vec[0].end() <= str->data()+str->size()); 365 str->replace(vec[0].data() - str->data(), vec[0].size(), s); 366 return true; 367 } 368 369 int RE2::GlobalReplace(string *str, 370 const RE2& re, 371 const StringPiece& rewrite) { 372 StringPiece vec[kVecSize]; 373 int nvec = 1 + MaxSubmatch(rewrite); 374 if (nvec > arraysize(vec)) 375 return false; 376 377 const char* p = str->data(); 378 const char* ep = p + str->size(); 379 const char* lastend = NULL; 380 string out; 381 int count = 0; 382 while (p <= ep) { 383 if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec)) 384 break; 385 if (p < vec[0].begin()) 386 out.append(p, vec[0].begin() - p); 387 if (vec[0].begin() == lastend && vec[0].size() == 0) { 388 // Disallow empty match at end of last match: skip ahead. 389 if (p < ep) 390 out.append(p, 1); 391 p++; 392 continue; 393 } 394 re.Rewrite(&out, rewrite, vec, nvec); 395 p = vec[0].end(); 396 lastend = p; 397 count++; 398 } 399 400 if (count == 0) 401 return 0; 402 403 if (p < ep) 404 out.append(p, ep - p); 405 swap(out, *str); 406 return count; 407 } 408 409 bool RE2::Extract(const StringPiece &text, 410 const RE2& re, 411 const StringPiece &rewrite, 412 string *out) { 413 StringPiece vec[kVecSize]; 414 int nvec = 1 + MaxSubmatch(rewrite); 415 if (nvec > arraysize(vec)) 416 return false; 417 418 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) 419 return false; 420 421 out->clear(); 422 return re.Rewrite(out, rewrite, vec, nvec); 423 } 424 425 string RE2::QuoteMeta(const StringPiece& unquoted) { 426 string result; 427 result.reserve(unquoted.size() << 1); 428 429 // Escape any ascii character not in [A-Za-z_0-9]. 430 // 431 // Note that it's legal to escape a character even if it has no 432 // special meaning in a regular expression -- so this function does 433 // that. (This also makes it identical to the perl function of the 434 // same name except for the null-character special case; 435 // see `perldoc -f quotemeta`.) 436 for (int ii = 0; ii < unquoted.length(); ++ii) { 437 // Note that using 'isalnum' here raises the benchmark time from 438 // 32ns to 58ns: 439 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && 440 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && 441 (unquoted[ii] < '0' || unquoted[ii] > '9') && 442 unquoted[ii] != '_' && 443 // If this is the part of a UTF8 or Latin1 character, we need 444 // to copy this byte without escaping. Experimentally this is 445 // what works correctly with the regexp library. 446 !(unquoted[ii] & 128)) { 447 if (unquoted[ii] == '\0') { // Special handling for null chars. 448 // Note that this special handling is not strictly required for RE2, 449 // but this quoting is required for other regexp libraries such as 450 // PCRE. 451 // Can't use "\\0" since the next character might be a digit. 452 result += "\\x00"; 453 continue; 454 } 455 result += '\\'; 456 } 457 result += unquoted[ii]; 458 } 459 460 return result; 461 } 462 463 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { 464 if (prog_ == NULL) 465 return false; 466 467 int n = prefix_.size(); 468 if (n > maxlen) 469 n = maxlen; 470 471 // Determine initial min max from prefix_ literal. 472 string pmin, pmax; 473 pmin = prefix_.substr(0, n); 474 pmax = prefix_.substr(0, n); 475 if (prefix_foldcase_) { 476 // prefix is ASCII lowercase; change pmin to uppercase. 477 for (int i = 0; i < n; i++) { 478 if ('a' <= pmin[i] && pmin[i] <= 'z') 479 pmin[i] += 'A' - 'a'; 480 } 481 } 482 483 // Add to prefix min max using PossibleMatchRange on regexp. 484 string dmin, dmax; 485 maxlen -= n; 486 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { 487 pmin += dmin; 488 pmax += dmax; 489 } else if (pmax.size() > 0) { 490 // prog_->PossibleMatchRange has failed us, 491 // but we still have useful information from prefix_. 492 // Round up pmax to allow any possible suffix. 493 pmax = PrefixSuccessor(pmax); 494 } else { 495 // Nothing useful. 496 *min = ""; 497 *max = ""; 498 return false; 499 } 500 501 *min = pmin; 502 *max = pmax; 503 return true; 504 } 505 506 // Avoid possible locale nonsense in standard strcasecmp. 507 // The string a is known to be all lowercase. 508 static int ascii_strcasecmp(const char* a, const char* b, int len) { 509 const char *ae = a + len; 510 511 for (; a < ae; a++, b++) { 512 uint8 x = *a; 513 uint8 y = *b; 514 if ('A' <= y && y <= 'Z') 515 y += 'a' - 'A'; 516 if (x != y) 517 return x - y; 518 } 519 return 0; 520 } 521 522 523 /***** Actual matching and rewriting code *****/ 524 525 bool RE2::Match(const StringPiece& text, 526 int startpos, 527 int endpos, 528 Anchor re_anchor, 529 StringPiece* submatch, 530 int nsubmatch) const { 531 if (!ok() || suffix_regexp_ == NULL) { 532 if (options_.log_errors()) 533 LOG(ERROR) << "Invalid RE2: " << *error_; 534 return false; 535 } 536 537 if (startpos < 0 || startpos > endpos || endpos > text.size()) { 538 if (options_.log_errors()) 539 LOG(ERROR) << "RE2: invalid startpos, endpos pair."; 540 return false; 541 } 542 543 StringPiece subtext = text; 544 subtext.remove_prefix(startpos); 545 subtext.remove_suffix(text.size() - endpos); 546 547 // Use DFAs to find exact location of match, filter out non-matches. 548 549 // Don't ask for the location if we won't use it. 550 // SearchDFA can do extra optimizations in that case. 551 StringPiece match; 552 StringPiece* matchp = &match; 553 if (nsubmatch == 0) 554 matchp = NULL; 555 556 int ncap = 1 + NumberOfCapturingGroups(); 557 if (ncap > nsubmatch) 558 ncap = nsubmatch; 559 560 // If the regexp is anchored explicitly, must not be in middle of text. 561 if (prog_->anchor_start() && startpos != 0) 562 return false; 563 564 // If the regexp is anchored explicitly, update re_anchor 565 // so that we can potentially fall into a faster case below. 566 if (prog_->anchor_start() && prog_->anchor_end()) 567 re_anchor = ANCHOR_BOTH; 568 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) 569 re_anchor = ANCHOR_START; 570 571 // Check for the required prefix, if any. 572 int prefixlen = 0; 573 if (!prefix_.empty()) { 574 if (startpos != 0) 575 return false; 576 prefixlen = prefix_.size(); 577 if (prefixlen > subtext.size()) 578 return false; 579 if (prefix_foldcase_) { 580 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) 581 return false; 582 } else { 583 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) 584 return false; 585 } 586 subtext.remove_prefix(prefixlen); 587 // If there is a required prefix, the anchor must be at least ANCHOR_START. 588 if (re_anchor != ANCHOR_BOTH) 589 re_anchor = ANCHOR_START; 590 } 591 592 Prog::Anchor anchor = Prog::kUnanchored; 593 Prog::MatchKind kind = Prog::kFirstMatch; 594 if (options_.longest_match()) 595 kind = Prog::kLongestMatch; 596 bool skipped_test = false; 597 598 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture); 599 600 // SearchBitState allocates a bit vector of size prog_->size() * text.size(). 601 // It also allocates a stack of 3-word structures which could potentially 602 // grow as large as prog_->size() * text.size() but in practice is much 603 // smaller. 604 // Conditions for using SearchBitState: 605 const int MaxBitStateProg = 500; // prog_->size() <= Max. 606 const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits) 607 bool can_bit_state = prog_->size() <= MaxBitStateProg; 608 int bit_state_text_max = MaxBitStateVector / prog_->size(); 609 610 bool dfa_failed = false; 611 switch (re_anchor) { 612 default: 613 case UNANCHORED: { 614 if (!prog_->SearchDFA(subtext, text, anchor, kind, 615 matchp, &dfa_failed, NULL)) { 616 if (dfa_failed) { 617 // Fall back to NFA below. 618 skipped_test = true; 619 if (FLAGS_trace_re2) 620 LOG(INFO) << "Match " << trunc(pattern_) 621 << " [" << CEscape(subtext) << "]" 622 << " DFA failed."; 623 break; 624 } 625 if (FLAGS_trace_re2) 626 LOG(INFO) << "Match " << trunc(pattern_) 627 << " [" << CEscape(subtext) << "]" 628 << " used DFA - no match."; 629 return false; 630 } 631 if (FLAGS_trace_re2) 632 LOG(INFO) << "Match " << trunc(pattern_) 633 << " [" << CEscape(subtext) << "]" 634 << " used DFA - match"; 635 if (matchp == NULL) // Matched. Don't care where 636 return true; 637 // SearchDFA set match[0].end() but didn't know where the 638 // match started. Run the regexp backward from match[0].end() 639 // to find the longest possible match -- that's where it started. 640 Prog* prog = ReverseProg(); 641 if (prog == NULL) 642 return false; 643 if (!prog->SearchDFA(match, text, Prog::kAnchored, 644 Prog::kLongestMatch, &match, &dfa_failed, NULL)) { 645 if (dfa_failed) { 646 // Fall back to NFA below. 647 skipped_test = true; 648 if (FLAGS_trace_re2) 649 LOG(INFO) << "Match " << trunc(pattern_) 650 << " [" << CEscape(subtext) << "]" 651 << " reverse DFA failed."; 652 break; 653 } 654 if (FLAGS_trace_re2) 655 LOG(INFO) << "Match " << trunc(pattern_) 656 << " [" << CEscape(subtext) << "]" 657 << " DFA inconsistency."; 658 if (options_.log_errors()) 659 LOG(ERROR) << "DFA inconsistency"; 660 return false; 661 } 662 if (FLAGS_trace_re2) 663 LOG(INFO) << "Match " << trunc(pattern_) 664 << " [" << CEscape(subtext) << "]" 665 << " used reverse DFA."; 666 break; 667 } 668 669 case ANCHOR_BOTH: 670 case ANCHOR_START: 671 if (re_anchor == ANCHOR_BOTH) 672 kind = Prog::kFullMatch; 673 anchor = Prog::kAnchored; 674 675 // If only a small amount of text and need submatch 676 // information anyway and we're going to use OnePass or BitState 677 // to get it, we might as well not even bother with the DFA: 678 // OnePass or BitState will be fast enough. 679 // On tiny texts, OnePass outruns even the DFA, and 680 // it doesn't have the shared state and occasional mutex that 681 // the DFA does. 682 if (can_one_pass && text.size() <= 4096 && 683 (ncap > 1 || text.size() <= 8)) { 684 if (FLAGS_trace_re2) 685 LOG(INFO) << "Match " << trunc(pattern_) 686 << " [" << CEscape(subtext) << "]" 687 << " skipping DFA for OnePass."; 688 skipped_test = true; 689 break; 690 } 691 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) { 692 if (FLAGS_trace_re2) 693 LOG(INFO) << "Match " << trunc(pattern_) 694 << " [" << CEscape(subtext) << "]" 695 << " skipping DFA for BitState."; 696 skipped_test = true; 697 break; 698 } 699 if (!prog_->SearchDFA(subtext, text, anchor, kind, 700 &match, &dfa_failed, NULL)) { 701 if (dfa_failed) { 702 if (FLAGS_trace_re2) 703 LOG(INFO) << "Match " << trunc(pattern_) 704 << " [" << CEscape(subtext) << "]" 705 << " DFA failed."; 706 skipped_test = true; 707 break; 708 } 709 if (FLAGS_trace_re2) 710 LOG(INFO) << "Match " << trunc(pattern_) 711 << " [" << CEscape(subtext) << "]" 712 << " used DFA - no match."; 713 return false; 714 } 715 break; 716 } 717 718 if (!skipped_test && ncap <= 1) { 719 // We know exactly where it matches. That's enough. 720 if (ncap == 1) 721 submatch[0] = match; 722 } else { 723 StringPiece subtext1; 724 if (skipped_test) { 725 // DFA ran out of memory or was skipped: 726 // need to search in entire original text. 727 subtext1 = subtext; 728 } else { 729 // DFA found the exact match location: 730 // let NFA run an anchored, full match search 731 // to find submatch locations. 732 subtext1 = match; 733 anchor = Prog::kAnchored; 734 kind = Prog::kFullMatch; 735 } 736 737 if (can_one_pass && anchor != Prog::kUnanchored) { 738 if (FLAGS_trace_re2) 739 LOG(INFO) << "Match " << trunc(pattern_) 740 << " [" << CEscape(subtext) << "]" 741 << " using OnePass."; 742 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { 743 if (!skipped_test && options_.log_errors()) 744 LOG(ERROR) << "SearchOnePass inconsistency"; 745 return false; 746 } 747 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) { 748 if (FLAGS_trace_re2) 749 LOG(INFO) << "Match " << trunc(pattern_) 750 << " [" << CEscape(subtext) << "]" 751 << " using BitState."; 752 if (!prog_->SearchBitState(subtext1, text, anchor, 753 kind, submatch, ncap)) { 754 if (!skipped_test && options_.log_errors()) 755 LOG(ERROR) << "SearchBitState inconsistency"; 756 return false; 757 } 758 } else { 759 if (FLAGS_trace_re2) 760 LOG(INFO) << "Match " << trunc(pattern_) 761 << " [" << CEscape(subtext) << "]" 762 << " using NFA."; 763 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { 764 if (!skipped_test && options_.log_errors()) 765 LOG(ERROR) << "SearchNFA inconsistency"; 766 return false; 767 } 768 } 769 } 770 771 // Adjust overall match for required prefix that we stripped off. 772 if (prefixlen > 0 && nsubmatch > 0) 773 submatch[0] = StringPiece(submatch[0].begin() - prefixlen, 774 submatch[0].size() + prefixlen); 775 776 // Zero submatches that don't exist in the regexp. 777 for (int i = ncap; i < nsubmatch; i++) 778 submatch[i] = NULL; 779 return true; 780 } 781 782 // Internal matcher - like Match() but takes Args not StringPieces. 783 bool RE2::DoMatch(const StringPiece& text, 784 Anchor anchor, 785 int* consumed, 786 const Arg* const* args, 787 int n) const { 788 if (!ok()) { 789 if (options_.log_errors()) 790 LOG(ERROR) << "Invalid RE2: " << *error_; 791 return false; 792 } 793 794 // Count number of capture groups needed. 795 int nvec; 796 if (n == 0 && consumed == NULL) 797 nvec = 0; 798 else 799 nvec = n+1; 800 801 StringPiece* vec; 802 StringPiece stkvec[kVecSize]; 803 StringPiece* heapvec = NULL; 804 805 if (nvec <= arraysize(stkvec)) { 806 vec = stkvec; 807 } else { 808 vec = new StringPiece[nvec]; 809 heapvec = vec; 810 } 811 812 if (!Match(text, 0, text.size(), anchor, vec, nvec)) { 813 delete[] heapvec; 814 return false; 815 } 816 817 if(consumed != NULL) 818 *consumed = vec[0].end() - text.begin(); 819 820 if (n == 0 || args == NULL) { 821 // We are not interested in results 822 delete[] heapvec; 823 return true; 824 } 825 826 int ncap = NumberOfCapturingGroups(); 827 if (ncap < n) { 828 // RE has fewer capturing groups than number of arg pointers passed in 829 VLOG(1) << "Asked for " << n << " but only have " << ncap; 830 delete[] heapvec; 831 return false; 832 } 833 834 // If we got here, we must have matched the whole pattern. 835 for (int i = 0; i < n; i++) { 836 const StringPiece& s = vec[i+1]; 837 if (!args[i]->Parse(s.data(), s.size())) { 838 // TODO: Should we indicate what the error was? 839 VLOG(1) << "Parse error on #" << i << " " << s << " " 840 << (void*)s.data() << "/" << s.size(); 841 delete[] heapvec; 842 return false; 843 } 844 } 845 846 delete[] heapvec; 847 return true; 848 } 849 850 // Append the "rewrite" string, with backslash subsitutions from "vec", 851 // to string "out". 852 bool RE2::Rewrite(string *out, const StringPiece &rewrite, 853 const StringPiece *vec, int veclen) const { 854 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 855 s < end; s++) { 856 int c = *s; 857 if (c == '\\') { 858 s++; 859 c = (s < end) ? *s : -1; 860 if (isdigit(c)) { 861 int n = (c - '0'); 862 if (n >= veclen) { 863 if (options_.log_errors()) { 864 LOG(ERROR) << "requested group " << n 865 << " in regexp " << rewrite.data(); 866 } 867 return false; 868 } 869 StringPiece snip = vec[n]; 870 if (snip.size() > 0) 871 out->append(snip.data(), snip.size()); 872 } else if (c == '\\') { 873 out->push_back('\\'); 874 } else { 875 if (options_.log_errors()) 876 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); 877 return false; 878 } 879 } else { 880 out->push_back(c); 881 } 882 } 883 return true; 884 } 885 886 // Return the number of capturing subpatterns, or -1 if the 887 // regexp wasn't valid on construction. 888 int RE2::NumberOfCapturingGroups() const { 889 if (suffix_regexp_ == NULL) 890 return -1; 891 ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case" 892 " multiple threads end up doing the same work in parallel."); 893 if (num_captures_ == -1) 894 num_captures_ = suffix_regexp_->NumCaptures(); 895 return num_captures_; 896 } 897 898 // Checks that the rewrite string is well-formed with respect to this 899 // regular expression. 900 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { 901 int max_token = -1; 902 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 903 s < end; s++) { 904 int c = *s; 905 if (c != '\\') { 906 continue; 907 } 908 if (++s == end) { 909 *error = "Rewrite schema error: '\\' not allowed at end."; 910 return false; 911 } 912 c = *s; 913 if (c == '\\') { 914 continue; 915 } 916 if (!isdigit(c)) { 917 *error = "Rewrite schema error: " 918 "'\\' must be followed by a digit or '\\'."; 919 return false; 920 } 921 int n = (c - '0'); 922 if (max_token < n) { 923 max_token = n; 924 } 925 } 926 927 if (max_token > NumberOfCapturingGroups()) { 928 SStringPrintf(error, "Rewrite schema requests %d matches, " 929 "but the regexp only has %d parenthesized subexpressions.", 930 max_token, NumberOfCapturingGroups()); 931 return false; 932 } 933 return true; 934 } 935 936 /***** Parsers for various types *****/ 937 938 bool RE2::Arg::parse_null(const char* str, int n, void* dest) { 939 // We fail if somebody asked us to store into a non-NULL void* pointer 940 return (dest == NULL); 941 } 942 943 bool RE2::Arg::parse_string(const char* str, int n, void* dest) { 944 if (dest == NULL) return true; 945 reinterpret_cast<string*>(dest)->assign(str, n); 946 return true; 947 } 948 949 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) { 950 if (dest == NULL) return true; 951 reinterpret_cast<StringPiece*>(dest)->set(str, n); 952 return true; 953 } 954 955 bool RE2::Arg::parse_char(const char* str, int n, void* dest) { 956 if (n != 1) return false; 957 if (dest == NULL) return true; 958 *(reinterpret_cast<char*>(dest)) = str[0]; 959 return true; 960 } 961 962 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { 963 if (n != 1) return false; 964 if (dest == NULL) return true; 965 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; 966 return true; 967 } 968 969 // Largest number spec that we are willing to parse 970 static const int kMaxNumberLength = 32; 971 972 // REQUIRES "buf" must have length at least kMaxNumberLength+1 973 // Copies "str" into "buf" and null-terminates. 974 // Overwrites *np with the new length. 975 static const char* TerminateNumber(char* buf, const char* str, int* np) { 976 int n = *np; 977 if (n <= 0) return ""; 978 if (n > 0 && isspace(*str)) { 979 // We are less forgiving than the strtoxxx() routines and do not 980 // allow leading spaces. 981 return ""; 982 } 983 984 // Although buf has a fixed maximum size, we can still handle 985 // arbitrarily large integers correctly by omitting leading zeros. 986 // (Numbers that are still too long will be out of range.) 987 // Before deciding whether str is too long, 988 // remove leading zeros with s/000+/00/. 989 // Leaving the leading two zeros in place means that 990 // we don't change 0000x123 (invalid) into 0x123 (valid). 991 // Skip over leading - before replacing. 992 bool neg = false; 993 if (n >= 1 && str[0] == '-') { 994 neg = true; 995 n--; 996 str++; 997 } 998 999 if (n >= 3 && str[0] == '0' && str[1] == '0') { 1000 while (n >= 3 && str[2] == '0') { 1001 n--; 1002 str++; 1003 } 1004 } 1005 1006 if (neg) { // make room in buf for - 1007 n++; 1008 str--; 1009 } 1010 1011 if (n > kMaxNumberLength) return ""; 1012 1013 memmove(buf, str, n); 1014 if (neg) { 1015 buf[0] = '-'; 1016 } 1017 buf[n] = '\0'; 1018 *np = n; 1019 return buf; 1020 } 1021 1022 bool RE2::Arg::parse_long_radix(const char* str, 1023 int n, 1024 void* dest, 1025 int radix) { 1026 if (n == 0) return false; 1027 char buf[kMaxNumberLength+1]; 1028 str = TerminateNumber(buf, str, &n); 1029 char* end; 1030 errno = 0; 1031 long r = strtol(str, &end, radix); 1032 if (end != str + n) return false; // Leftover junk 1033 if (errno) return false; 1034 if (dest == NULL) return true; 1035 *(reinterpret_cast<long*>(dest)) = r; 1036 return true; 1037 } 1038 1039 bool RE2::Arg::parse_ulong_radix(const char* str, 1040 int n, 1041 void* dest, 1042 int radix) { 1043 if (n == 0) return false; 1044 char buf[kMaxNumberLength+1]; 1045 str = TerminateNumber(buf, str, &n); 1046 if (str[0] == '-') { 1047 // strtoul() will silently accept negative numbers and parse 1048 // them. This module is more strict and treats them as errors. 1049 return false; 1050 } 1051 1052 char* end; 1053 errno = 0; 1054 unsigned long r = strtoul(str, &end, radix); 1055 if (end != str + n) return false; // Leftover junk 1056 if (errno) return false; 1057 if (dest == NULL) return true; 1058 *(reinterpret_cast<unsigned long*>(dest)) = r; 1059 return true; 1060 } 1061 1062 bool RE2::Arg::parse_short_radix(const char* str, 1063 int n, 1064 void* dest, 1065 int radix) { 1066 long r; 1067 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse 1068 if ((short)r != r) return false; // Out of range 1069 if (dest == NULL) return true; 1070 *(reinterpret_cast<short*>(dest)) = r; 1071 return true; 1072 } 1073 1074 bool RE2::Arg::parse_ushort_radix(const char* str, 1075 int n, 1076 void* dest, 1077 int radix) { 1078 unsigned long r; 1079 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse 1080 if ((ushort)r != r) return false; // Out of range 1081 if (dest == NULL) return true; 1082 *(reinterpret_cast<unsigned short*>(dest)) = r; 1083 return true; 1084 } 1085 1086 bool RE2::Arg::parse_int_radix(const char* str, 1087 int n, 1088 void* dest, 1089 int radix) { 1090 long r; 1091 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse 1092 if ((int)r != r) return false; // Out of range 1093 if (dest == NULL) return true; 1094 *(reinterpret_cast<int*>(dest)) = r; 1095 return true; 1096 } 1097 1098 bool RE2::Arg::parse_uint_radix(const char* str, 1099 int n, 1100 void* dest, 1101 int radix) { 1102 unsigned long r; 1103 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse 1104 if ((uint)r != r) return false; // Out of range 1105 if (dest == NULL) return true; 1106 *(reinterpret_cast<unsigned int*>(dest)) = r; 1107 return true; 1108 } 1109 1110 bool RE2::Arg::parse_longlong_radix(const char* str, 1111 int n, 1112 void* dest, 1113 int radix) { 1114 if (n == 0) return false; 1115 char buf[kMaxNumberLength+1]; 1116 str = TerminateNumber(buf, str, &n); 1117 char* end; 1118 errno = 0; 1119 int64 r = strtoll(str, &end, radix); 1120 if (end != str + n) return false; // Leftover junk 1121 if (errno) return false; 1122 if (dest == NULL) return true; 1123 *(reinterpret_cast<int64*>(dest)) = r; 1124 return true; 1125 } 1126 1127 bool RE2::Arg::parse_ulonglong_radix(const char* str, 1128 int n, 1129 void* dest, 1130 int radix) { 1131 if (n == 0) return false; 1132 char buf[kMaxNumberLength+1]; 1133 str = TerminateNumber(buf, str, &n); 1134 if (str[0] == '-') { 1135 // strtoull() will silently accept negative numbers and parse 1136 // them. This module is more strict and treats them as errors. 1137 return false; 1138 } 1139 char* end; 1140 errno = 0; 1141 uint64 r = strtoull(str, &end, radix); 1142 if (end != str + n) return false; // Leftover junk 1143 if (errno) return false; 1144 if (dest == NULL) return true; 1145 *(reinterpret_cast<uint64*>(dest)) = r; 1146 return true; 1147 } 1148 1149 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) { 1150 if (n == 0) return false; 1151 static const int kMaxLength = 200; 1152 char buf[kMaxLength]; 1153 if (n >= kMaxLength) return false; 1154 memcpy(buf, str, n); 1155 buf[n] = '\0'; 1156 errno = 0; 1157 char* end; 1158 double r; 1159 if (isfloat) { 1160 r = strtof(buf, &end); 1161 } else { 1162 r = strtod(buf, &end); 1163 } 1164 if (end != buf + n) return false; // Leftover junk 1165 if (errno) return false; 1166 if (dest == NULL) return true; 1167 if (isfloat) { 1168 *(reinterpret_cast<float*>(dest)) = r; 1169 } else { 1170 *(reinterpret_cast<double*>(dest)) = r; 1171 } 1172 return true; 1173 } 1174 1175 bool RE2::Arg::parse_double(const char* str, int n, void* dest) { 1176 return parse_double_float(str, n, false, dest); 1177 } 1178 1179 bool RE2::Arg::parse_float(const char* str, int n, void* dest) { 1180 return parse_double_float(str, n, true, dest); 1181 } 1182 1183 1184 #define DEFINE_INTEGER_PARSERS(name) \ 1185 bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \ 1186 return parse_##name##_radix(str, n, dest, 10); \ 1187 } \ 1188 bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ 1189 return parse_##name##_radix(str, n, dest, 16); \ 1190 } \ 1191 bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ 1192 return parse_##name##_radix(str, n, dest, 8); \ 1193 } \ 1194 bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ 1195 return parse_##name##_radix(str, n, dest, 0); \ 1196 } 1197 1198 DEFINE_INTEGER_PARSERS(short); 1199 DEFINE_INTEGER_PARSERS(ushort); 1200 DEFINE_INTEGER_PARSERS(int); 1201 DEFINE_INTEGER_PARSERS(uint); 1202 DEFINE_INTEGER_PARSERS(long); 1203 DEFINE_INTEGER_PARSERS(ulong); 1204 DEFINE_INTEGER_PARSERS(longlong); 1205 DEFINE_INTEGER_PARSERS(ulonglong); 1206 1207 #undef DEFINE_INTEGER_PARSERS 1208 1209 } // namespace re2 1210