1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Regular expression interface RE2. 6 // 7 // Originally the PCRE C++ wrapper, but adapted to use 8 // the new automata-based regular expression engines. 9 10 #include "re2/re2.h" 11 12 #include <stdio.h> 13 #include <string> 14 #ifdef WIN32 15 #define strtoll _strtoi64 16 #define strtoull _strtoui64 17 #define strtof strtod 18 #else 19 #include <pthread.h> 20 #endif 21 #include <errno.h> 22 #include "util/util.h" 23 #include "util/flags.h" 24 #include "re2/prog.h" 25 #include "re2/regexp.h" 26 27 DEFINE_bool(trace_re2, false, "trace RE2 execution"); 28 29 namespace re2 { 30 31 // Maximum number of args we can set 32 static const int kMaxArgs = 16; 33 static const int kVecSize = 1+kMaxArgs; 34 35 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch = {}; 36 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch = {}; 37 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume = {}; 38 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume = {}; 39 40 #define kDefaultMaxMem (8<<20) 41 42 RE2::Options::Options() 43 : encoding_(EncodingUTF8), 44 posix_syntax_(false), 45 longest_match_(false), 46 log_errors_(true), 47 max_mem_(kDefaultMaxMem), 48 literal_(false), 49 never_nl_(false), 50 never_capture_(false), 51 case_sensitive_(true), 52 perl_classes_(false), 53 word_boundary_(false), 54 one_line_(false) { 55 } 56 57 RE2::Options::Options(RE2::CannedOptions opt) 58 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), 59 posix_syntax_(opt == RE2::POSIX_SYNTAX), 60 longest_match_(opt == RE2::POSIX_SYNTAX), 61 log_errors_(opt != RE2::Quiet), 62 max_mem_(kDefaultMaxMem), 63 literal_(false), 64 never_nl_(false), 65 never_capture_(false), 66 case_sensitive_(true), 67 perl_classes_(false), 68 word_boundary_(false), 69 one_line_(false) { 70 } 71 72 // static empty things for use as const references. 73 // To avoid global constructors, initialized on demand. 74 GLOBAL_MUTEX(empty_mutex); 75 static const string *empty_string; 76 static const map<string, int> *empty_named_groups; 77 static const map<int, string> *empty_group_names; 78 79 static void InitEmpty() { 80 GLOBAL_MUTEX_LOCK(empty_mutex); 81 if (empty_string == NULL) { 82 empty_string = new string; 83 empty_named_groups = new map<string, int>; 84 empty_group_names = new map<int, string>; 85 } 86 GLOBAL_MUTEX_UNLOCK(empty_mutex); 87 } 88 89 // Converts from Regexp error code to RE2 error code. 90 // Maybe some day they will diverge. In any event, this 91 // hides the existence of Regexp from RE2 users. 92 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { 93 switch (code) { 94 case re2::kRegexpSuccess: 95 return RE2::NoError; 96 case re2::kRegexpInternalError: 97 return RE2::ErrorInternal; 98 case re2::kRegexpBadEscape: 99 return RE2::ErrorBadEscape; 100 case re2::kRegexpBadCharClass: 101 return RE2::ErrorBadCharClass; 102 case re2::kRegexpBadCharRange: 103 return RE2::ErrorBadCharRange; 104 case re2::kRegexpMissingBracket: 105 return RE2::ErrorMissingBracket; 106 case re2::kRegexpMissingParen: 107 return RE2::ErrorMissingParen; 108 case re2::kRegexpTrailingBackslash: 109 return RE2::ErrorTrailingBackslash; 110 case re2::kRegexpRepeatArgument: 111 return RE2::ErrorRepeatArgument; 112 case re2::kRegexpRepeatSize: 113 return RE2::ErrorRepeatSize; 114 case re2::kRegexpRepeatOp: 115 return RE2::ErrorRepeatOp; 116 case re2::kRegexpBadPerlOp: 117 return RE2::ErrorBadPerlOp; 118 case re2::kRegexpBadUTF8: 119 return RE2::ErrorBadUTF8; 120 case re2::kRegexpBadNamedCapture: 121 return RE2::ErrorBadNamedCapture; 122 } 123 return RE2::ErrorInternal; 124 } 125 126 static string trunc(const StringPiece& pattern) { 127 if (pattern.size() < 100) 128 return pattern.as_string(); 129 return pattern.substr(0, 100).as_string() + "..."; 130 } 131 132 133 RE2::RE2(const char* pattern) { 134 Init(pattern, DefaultOptions); 135 } 136 137 RE2::RE2(const string& pattern) { 138 Init(pattern, DefaultOptions); 139 } 140 141 RE2::RE2(const StringPiece& pattern) { 142 Init(pattern, DefaultOptions); 143 } 144 145 RE2::RE2(const StringPiece& pattern, const Options& options) { 146 Init(pattern, options); 147 } 148 149 int RE2::Options::ParseFlags() const { 150 int flags = Regexp::ClassNL; 151 switch (encoding()) { 152 default: 153 if (log_errors()) 154 LOG(ERROR) << "Unknown encoding " << encoding(); 155 break; 156 case RE2::Options::EncodingUTF8: 157 break; 158 case RE2::Options::EncodingLatin1: 159 flags |= Regexp::Latin1; 160 break; 161 } 162 163 if (!posix_syntax()) 164 flags |= Regexp::LikePerl; 165 166 if (literal()) 167 flags |= Regexp::Literal; 168 169 if (never_nl()) 170 flags |= Regexp::NeverNL; 171 172 if (never_capture()) 173 flags |= Regexp::NeverCapture; 174 175 if (!case_sensitive()) 176 flags |= Regexp::FoldCase; 177 178 if (perl_classes()) 179 flags |= Regexp::PerlClasses; 180 181 if (word_boundary()) 182 flags |= Regexp::PerlB; 183 184 if (one_line()) 185 flags |= Regexp::OneLine; 186 187 return flags; 188 } 189 190 void RE2::Init(const StringPiece& pattern, const Options& options) { 191 mutex_ = new Mutex; 192 pattern_ = pattern.as_string(); 193 options_.Copy(options); 194 InitEmpty(); 195 error_ = empty_string; 196 error_code_ = NoError; 197 suffix_regexp_ = NULL; 198 entire_regexp_ = NULL; 199 prog_ = NULL; 200 rprog_ = NULL; 201 named_groups_ = NULL; 202 group_names_ = NULL; 203 num_captures_ = -1; 204 205 RegexpStatus status; 206 entire_regexp_ = Regexp::Parse( 207 pattern_, 208 static_cast<Regexp::ParseFlags>(options_.ParseFlags()), 209 &status); 210 if (entire_regexp_ == NULL) { 211 if (error_ == empty_string) 212 error_ = new string(status.Text()); 213 if (options_.log_errors()) { 214 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " 215 << status.Text(); 216 } 217 error_arg_ = status.error_arg().as_string(); 218 error_code_ = RegexpErrorToRE2(status.code()); 219 return; 220 } 221 222 prefix_.clear(); 223 prefix_foldcase_ = false; 224 re2::Regexp* suffix; 225 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) 226 suffix_regexp_ = suffix; 227 else 228 suffix_regexp_ = entire_regexp_->Incref(); 229 230 // Two thirds of the memory goes to the forward Prog, 231 // one third to the reverse prog, because the forward 232 // Prog has two DFAs but the reverse prog has one. 233 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); 234 if (prog_ == NULL) { 235 if (options_.log_errors()) 236 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'"; 237 error_ = new string("pattern too large - compile failed"); 238 error_code_ = RE2::ErrorPatternTooLarge; 239 return; 240 } 241 242 // Could delay this until the first match call that 243 // cares about submatch information, but the one-pass 244 // machine's memory gets cut from the DFA memory budget, 245 // and that is harder to do if the DFA has already 246 // been built. 247 is_one_pass_ = prog_->IsOnePass(); 248 } 249 250 // Returns rprog_, computing it if needed. 251 re2::Prog* RE2::ReverseProg() const { 252 MutexLock l(mutex_); 253 if (rprog_ == NULL && error_ == empty_string) { 254 rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3); 255 if (rprog_ == NULL) { 256 if (options_.log_errors()) 257 LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'"; 258 error_ = new string("pattern too large - reverse compile failed"); 259 error_code_ = RE2::ErrorPatternTooLarge; 260 return NULL; 261 } 262 } 263 return rprog_; 264 } 265 266 RE2::~RE2() { 267 if (suffix_regexp_) 268 suffix_regexp_->Decref(); 269 if (entire_regexp_) 270 entire_regexp_->Decref(); 271 delete mutex_; 272 delete prog_; 273 delete rprog_; 274 if (error_ != empty_string) 275 delete error_; 276 if (named_groups_ != NULL && named_groups_ != empty_named_groups) 277 delete named_groups_; 278 if (group_names_ != NULL && group_names_ != empty_group_names) 279 delete group_names_; 280 } 281 282 int RE2::ProgramSize() const { 283 if (prog_ == NULL) 284 return -1; 285 return prog_->size(); 286 } 287 288 // Returns named_groups_, computing it if needed. 289 const map<string, int>& RE2::NamedCapturingGroups() const { 290 MutexLock l(mutex_); 291 if (!ok()) 292 return *empty_named_groups; 293 if (named_groups_ == NULL) { 294 named_groups_ = suffix_regexp_->NamedCaptures(); 295 if (named_groups_ == NULL) 296 named_groups_ = empty_named_groups; 297 } 298 return *named_groups_; 299 } 300 301 // Returns group_names_, computing it if needed. 302 const map<int, string>& RE2::CapturingGroupNames() const { 303 MutexLock l(mutex_); 304 if (!ok()) 305 return *empty_group_names; 306 if (group_names_ == NULL) { 307 group_names_ = suffix_regexp_->CaptureNames(); 308 if (group_names_ == NULL) 309 group_names_ = empty_group_names; 310 } 311 return *group_names_; 312 } 313 314 /***** Convenience interfaces *****/ 315 316 bool RE2::FullMatchN(const StringPiece& text, const RE2& re, 317 const Arg* const args[], int n) { 318 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); 319 } 320 321 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, 322 const Arg* const args[], int n) { 323 return re.DoMatch(text, UNANCHORED, NULL, args, n); 324 } 325 326 bool RE2::ConsumeN(StringPiece* input, const RE2& re, 327 const Arg* const args[], int n) { 328 int consumed; 329 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { 330 input->remove_prefix(consumed); 331 return true; 332 } else { 333 return false; 334 } 335 } 336 337 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, 338 const Arg* const args[], int n) { 339 int consumed; 340 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { 341 input->remove_prefix(consumed); 342 return true; 343 } else { 344 return false; 345 } 346 } 347 348 // Returns the maximum submatch needed for the rewrite to be done by Replace(). 349 // E.g. if rewrite == "foo \\2,\\1", returns 2. 350 int RE2::MaxSubmatch(const StringPiece& rewrite) { 351 int max = 0; 352 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 353 s < end; s++) { 354 if (*s == '\\') { 355 s++; 356 int c = (s < end) ? *s : -1; 357 if (isdigit(c)) { 358 int n = (c - '0'); 359 if (n > max) 360 max = n; 361 } 362 } 363 } 364 return max; 365 } 366 367 bool RE2::Replace(string *str, 368 const RE2& re, 369 const StringPiece& rewrite) { 370 StringPiece vec[kVecSize]; 371 int nvec = 1 + MaxSubmatch(rewrite); 372 if (nvec > arraysize(vec)) 373 return false; 374 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) 375 return false; 376 377 string s; 378 if (!re.Rewrite(&s, rewrite, vec, nvec)) 379 return false; 380 381 assert(vec[0].begin() >= str->data()); 382 assert(vec[0].end() <= str->data()+str->size()); 383 str->replace(vec[0].data() - str->data(), vec[0].size(), s); 384 return true; 385 } 386 387 int RE2::GlobalReplace(string *str, 388 const RE2& re, 389 const StringPiece& rewrite) { 390 StringPiece vec[kVecSize]; 391 int nvec = 1 + MaxSubmatch(rewrite); 392 if (nvec > arraysize(vec)) 393 return false; 394 395 const char* p = str->data(); 396 const char* ep = p + str->size(); 397 const char* lastend = NULL; 398 string out; 399 int count = 0; 400 while (p <= ep) { 401 if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec)) 402 break; 403 if (p < vec[0].begin()) 404 out.append(p, vec[0].begin() - p); 405 if (vec[0].begin() == lastend && vec[0].size() == 0) { 406 // Disallow empty match at end of last match: skip ahead. 407 if (p < ep) 408 out.append(p, 1); 409 p++; 410 continue; 411 } 412 re.Rewrite(&out, rewrite, vec, nvec); 413 p = vec[0].end(); 414 lastend = p; 415 count++; 416 } 417 418 if (count == 0) 419 return 0; 420 421 if (p < ep) 422 out.append(p, ep - p); 423 swap(out, *str); 424 return count; 425 } 426 427 bool RE2::Extract(const StringPiece &text, 428 const RE2& re, 429 const StringPiece &rewrite, 430 string *out) { 431 StringPiece vec[kVecSize]; 432 int nvec = 1 + MaxSubmatch(rewrite); 433 if (nvec > arraysize(vec)) 434 return false; 435 436 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) 437 return false; 438 439 out->clear(); 440 return re.Rewrite(out, rewrite, vec, nvec); 441 } 442 443 string RE2::QuoteMeta(const StringPiece& unquoted) { 444 string result; 445 result.reserve(unquoted.size() << 1); 446 447 // Escape any ascii character not in [A-Za-z_0-9]. 448 // 449 // Note that it's legal to escape a character even if it has no 450 // special meaning in a regular expression -- so this function does 451 // that. (This also makes it identical to the perl function of the 452 // same name except for the null-character special case; 453 // see `perldoc -f quotemeta`.) 454 for (int ii = 0; ii < unquoted.length(); ++ii) { 455 // Note that using 'isalnum' here raises the benchmark time from 456 // 32ns to 58ns: 457 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && 458 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && 459 (unquoted[ii] < '0' || unquoted[ii] > '9') && 460 unquoted[ii] != '_' && 461 // If this is the part of a UTF8 or Latin1 character, we need 462 // to copy this byte without escaping. Experimentally this is 463 // what works correctly with the regexp library. 464 !(unquoted[ii] & 128)) { 465 if (unquoted[ii] == '\0') { // Special handling for null chars. 466 // Note that this special handling is not strictly required for RE2, 467 // but this quoting is required for other regexp libraries such as 468 // PCRE. 469 // Can't use "\\0" since the next character might be a digit. 470 result += "\\x00"; 471 continue; 472 } 473 result += '\\'; 474 } 475 result += unquoted[ii]; 476 } 477 478 return result; 479 } 480 481 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { 482 if (prog_ == NULL) 483 return false; 484 485 int n = prefix_.size(); 486 if (n > maxlen) 487 n = maxlen; 488 489 // Determine initial min max from prefix_ literal. 490 string pmin, pmax; 491 pmin = prefix_.substr(0, n); 492 pmax = prefix_.substr(0, n); 493 if (prefix_foldcase_) { 494 // prefix is ASCII lowercase; change pmin to uppercase. 495 for (int i = 0; i < n; i++) { 496 if ('a' <= pmin[i] && pmin[i] <= 'z') 497 pmin[i] += 'A' - 'a'; 498 } 499 } 500 501 // Add to prefix min max using PossibleMatchRange on regexp. 502 string dmin, dmax; 503 maxlen -= n; 504 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { 505 pmin += dmin; 506 pmax += dmax; 507 } else if (pmax.size() > 0) { 508 // prog_->PossibleMatchRange has failed us, 509 // but we still have useful information from prefix_. 510 // Round up pmax to allow any possible suffix. 511 pmax = PrefixSuccessor(pmax); 512 } else { 513 // Nothing useful. 514 *min = ""; 515 *max = ""; 516 return false; 517 } 518 519 *min = pmin; 520 *max = pmax; 521 return true; 522 } 523 524 // Avoid possible locale nonsense in standard strcasecmp. 525 // The string a is known to be all lowercase. 526 static int ascii_strcasecmp(const char* a, const char* b, int len) { 527 const char *ae = a + len; 528 529 for (; a < ae; a++, b++) { 530 uint8 x = *a; 531 uint8 y = *b; 532 if ('A' <= y && y <= 'Z') 533 y += 'a' - 'A'; 534 if (x != y) 535 return x - y; 536 } 537 return 0; 538 } 539 540 541 /***** Actual matching and rewriting code *****/ 542 543 bool RE2::Match(const StringPiece& text, 544 int startpos, 545 int endpos, 546 Anchor re_anchor, 547 StringPiece* submatch, 548 int nsubmatch) const { 549 if (!ok() || suffix_regexp_ == NULL) { 550 if (options_.log_errors()) 551 LOG(ERROR) << "Invalid RE2: " << *error_; 552 return false; 553 } 554 555 if (startpos < 0 || startpos > endpos || endpos > text.size()) { 556 if (options_.log_errors()) 557 LOG(ERROR) << "RE2: invalid startpos, endpos pair."; 558 return false; 559 } 560 561 StringPiece subtext = text; 562 subtext.remove_prefix(startpos); 563 subtext.remove_suffix(text.size() - endpos); 564 565 // Use DFAs to find exact location of match, filter out non-matches. 566 567 // Don't ask for the location if we won't use it. 568 // SearchDFA can do extra optimizations in that case. 569 StringPiece match; 570 StringPiece* matchp = &match; 571 if (nsubmatch == 0) 572 matchp = NULL; 573 574 int ncap = 1 + NumberOfCapturingGroups(); 575 if (ncap > nsubmatch) 576 ncap = nsubmatch; 577 578 // If the regexp is anchored explicitly, must not be in middle of text. 579 if (prog_->anchor_start() && startpos != 0) 580 return false; 581 582 // If the regexp is anchored explicitly, update re_anchor 583 // so that we can potentially fall into a faster case below. 584 if (prog_->anchor_start() && prog_->anchor_end()) 585 re_anchor = ANCHOR_BOTH; 586 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) 587 re_anchor = ANCHOR_START; 588 589 // Check for the required prefix, if any. 590 int prefixlen = 0; 591 if (!prefix_.empty()) { 592 if (startpos != 0) 593 return false; 594 prefixlen = prefix_.size(); 595 if (prefixlen > subtext.size()) 596 return false; 597 if (prefix_foldcase_) { 598 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) 599 return false; 600 } else { 601 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) 602 return false; 603 } 604 subtext.remove_prefix(prefixlen); 605 // If there is a required prefix, the anchor must be at least ANCHOR_START. 606 if (re_anchor != ANCHOR_BOTH) 607 re_anchor = ANCHOR_START; 608 } 609 610 Prog::Anchor anchor = Prog::kUnanchored; 611 Prog::MatchKind kind = Prog::kFirstMatch; 612 if (options_.longest_match()) 613 kind = Prog::kLongestMatch; 614 bool skipped_test = false; 615 616 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture); 617 618 // SearchBitState allocates a bit vector of size prog_->size() * text.size(). 619 // It also allocates a stack of 3-word structures which could potentially 620 // grow as large as prog_->size() * text.size() but in practice is much 621 // smaller. 622 // Conditions for using SearchBitState: 623 const int MaxBitStateProg = 500; // prog_->size() <= Max. 624 const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits) 625 bool can_bit_state = prog_->size() <= MaxBitStateProg; 626 int bit_state_text_max = MaxBitStateVector / prog_->size(); 627 628 bool dfa_failed = false; 629 switch (re_anchor) { 630 default: 631 case UNANCHORED: { 632 if (!prog_->SearchDFA(subtext, text, anchor, kind, 633 matchp, &dfa_failed, NULL)) { 634 if (dfa_failed) { 635 // Fall back to NFA below. 636 skipped_test = true; 637 if (FLAGS_trace_re2) 638 LOG(INFO) << "Match " << trunc(pattern_) 639 << " [" << CEscape(subtext) << "]" 640 << " DFA failed."; 641 break; 642 } 643 if (FLAGS_trace_re2) 644 LOG(INFO) << "Match " << trunc(pattern_) 645 << " [" << CEscape(subtext) << "]" 646 << " used DFA - no match."; 647 return false; 648 } 649 if (FLAGS_trace_re2) 650 LOG(INFO) << "Match " << trunc(pattern_) 651 << " [" << CEscape(subtext) << "]" 652 << " used DFA - match"; 653 if (matchp == NULL) // Matched. Don't care where 654 return true; 655 // SearchDFA set match[0].end() but didn't know where the 656 // match started. Run the regexp backward from match[0].end() 657 // to find the longest possible match -- that's where it started. 658 Prog* prog = ReverseProg(); 659 if (prog == NULL) 660 return false; 661 if (!prog->SearchDFA(match, text, Prog::kAnchored, 662 Prog::kLongestMatch, &match, &dfa_failed, NULL)) { 663 if (dfa_failed) { 664 // Fall back to NFA below. 665 skipped_test = true; 666 if (FLAGS_trace_re2) 667 LOG(INFO) << "Match " << trunc(pattern_) 668 << " [" << CEscape(subtext) << "]" 669 << " reverse DFA failed."; 670 break; 671 } 672 if (FLAGS_trace_re2) 673 LOG(INFO) << "Match " << trunc(pattern_) 674 << " [" << CEscape(subtext) << "]" 675 << " DFA inconsistency."; 676 if (options_.log_errors()) 677 LOG(ERROR) << "DFA inconsistency"; 678 return false; 679 } 680 if (FLAGS_trace_re2) 681 LOG(INFO) << "Match " << trunc(pattern_) 682 << " [" << CEscape(subtext) << "]" 683 << " used reverse DFA."; 684 break; 685 } 686 687 case ANCHOR_BOTH: 688 case ANCHOR_START: 689 if (re_anchor == ANCHOR_BOTH) 690 kind = Prog::kFullMatch; 691 anchor = Prog::kAnchored; 692 693 // If only a small amount of text and need submatch 694 // information anyway and we're going to use OnePass or BitState 695 // to get it, we might as well not even bother with the DFA: 696 // OnePass or BitState will be fast enough. 697 // On tiny texts, OnePass outruns even the DFA, and 698 // it doesn't have the shared state and occasional mutex that 699 // the DFA does. 700 if (can_one_pass && text.size() <= 4096 && 701 (ncap > 1 || text.size() <= 8)) { 702 if (FLAGS_trace_re2) 703 LOG(INFO) << "Match " << trunc(pattern_) 704 << " [" << CEscape(subtext) << "]" 705 << " skipping DFA for OnePass."; 706 skipped_test = true; 707 break; 708 } 709 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) { 710 if (FLAGS_trace_re2) 711 LOG(INFO) << "Match " << trunc(pattern_) 712 << " [" << CEscape(subtext) << "]" 713 << " skipping DFA for BitState."; 714 skipped_test = true; 715 break; 716 } 717 if (!prog_->SearchDFA(subtext, text, anchor, kind, 718 &match, &dfa_failed, NULL)) { 719 if (dfa_failed) { 720 if (FLAGS_trace_re2) 721 LOG(INFO) << "Match " << trunc(pattern_) 722 << " [" << CEscape(subtext) << "]" 723 << " DFA failed."; 724 skipped_test = true; 725 break; 726 } 727 if (FLAGS_trace_re2) 728 LOG(INFO) << "Match " << trunc(pattern_) 729 << " [" << CEscape(subtext) << "]" 730 << " used DFA - no match."; 731 return false; 732 } 733 break; 734 } 735 736 if (!skipped_test && ncap <= 1) { 737 // We know exactly where it matches. That's enough. 738 if (ncap == 1) 739 submatch[0] = match; 740 } else { 741 StringPiece subtext1; 742 if (skipped_test) { 743 // DFA ran out of memory or was skipped: 744 // need to search in entire original text. 745 subtext1 = subtext; 746 } else { 747 // DFA found the exact match location: 748 // let NFA run an anchored, full match search 749 // to find submatch locations. 750 subtext1 = match; 751 anchor = Prog::kAnchored; 752 kind = Prog::kFullMatch; 753 } 754 755 if (can_one_pass && anchor != Prog::kUnanchored) { 756 if (FLAGS_trace_re2) 757 LOG(INFO) << "Match " << trunc(pattern_) 758 << " [" << CEscape(subtext) << "]" 759 << " using OnePass."; 760 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { 761 if (!skipped_test && options_.log_errors()) 762 LOG(ERROR) << "SearchOnePass inconsistency"; 763 return false; 764 } 765 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) { 766 if (FLAGS_trace_re2) 767 LOG(INFO) << "Match " << trunc(pattern_) 768 << " [" << CEscape(subtext) << "]" 769 << " using BitState."; 770 if (!prog_->SearchBitState(subtext1, text, anchor, 771 kind, submatch, ncap)) { 772 if (!skipped_test && options_.log_errors()) 773 LOG(ERROR) << "SearchBitState inconsistency"; 774 return false; 775 } 776 } else { 777 if (FLAGS_trace_re2) 778 LOG(INFO) << "Match " << trunc(pattern_) 779 << " [" << CEscape(subtext) << "]" 780 << " using NFA."; 781 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { 782 if (!skipped_test && options_.log_errors()) 783 LOG(ERROR) << "SearchNFA inconsistency"; 784 return false; 785 } 786 } 787 } 788 789 // Adjust overall match for required prefix that we stripped off. 790 if (prefixlen > 0 && nsubmatch > 0) 791 submatch[0] = StringPiece(submatch[0].begin() - prefixlen, 792 submatch[0].size() + prefixlen); 793 794 // Zero submatches that don't exist in the regexp. 795 for (int i = ncap; i < nsubmatch; i++) 796 submatch[i] = NULL; 797 return true; 798 } 799 800 // Internal matcher - like Match() but takes Args not StringPieces. 801 bool RE2::DoMatch(const StringPiece& text, 802 Anchor anchor, 803 int* consumed, 804 const Arg* const* args, 805 int n) const { 806 if (!ok()) { 807 if (options_.log_errors()) 808 LOG(ERROR) << "Invalid RE2: " << *error_; 809 return false; 810 } 811 812 // Count number of capture groups needed. 813 int nvec; 814 if (n == 0 && consumed == NULL) 815 nvec = 0; 816 else 817 nvec = n+1; 818 819 StringPiece* vec; 820 StringPiece stkvec[kVecSize]; 821 StringPiece* heapvec = NULL; 822 823 if (nvec <= arraysize(stkvec)) { 824 vec = stkvec; 825 } else { 826 vec = new StringPiece[nvec]; 827 heapvec = vec; 828 } 829 830 if (!Match(text, 0, text.size(), anchor, vec, nvec)) { 831 delete[] heapvec; 832 return false; 833 } 834 835 if(consumed != NULL) 836 *consumed = vec[0].end() - text.begin(); 837 838 if (n == 0 || args == NULL) { 839 // We are not interested in results 840 delete[] heapvec; 841 return true; 842 } 843 844 int ncap = NumberOfCapturingGroups(); 845 if (ncap < n) { 846 // RE has fewer capturing groups than number of arg pointers passed in 847 VLOG(1) << "Asked for " << n << " but only have " << ncap; 848 delete[] heapvec; 849 return false; 850 } 851 852 // If we got here, we must have matched the whole pattern. 853 for (int i = 0; i < n; i++) { 854 const StringPiece& s = vec[i+1]; 855 if (!args[i]->Parse(s.data(), s.size())) { 856 // TODO: Should we indicate what the error was? 857 VLOG(1) << "Parse error on #" << i << " " << s << " " 858 << (void*)s.data() << "/" << s.size(); 859 delete[] heapvec; 860 return false; 861 } 862 } 863 864 delete[] heapvec; 865 return true; 866 } 867 868 // Append the "rewrite" string, with backslash subsitutions from "vec", 869 // to string "out". 870 bool RE2::Rewrite(string *out, const StringPiece &rewrite, 871 const StringPiece *vec, int veclen) const { 872 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 873 s < end; s++) { 874 int c = *s; 875 if (c == '\\') { 876 s++; 877 c = (s < end) ? *s : -1; 878 if (isdigit(c)) { 879 int n = (c - '0'); 880 if (n >= veclen) { 881 if (options_.log_errors()) { 882 LOG(ERROR) << "requested group " << n 883 << " in regexp " << rewrite.data(); 884 } 885 return false; 886 } 887 StringPiece snip = vec[n]; 888 if (snip.size() > 0) 889 out->append(snip.data(), snip.size()); 890 } else if (c == '\\') { 891 out->push_back('\\'); 892 } else { 893 if (options_.log_errors()) 894 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); 895 return false; 896 } 897 } else { 898 out->push_back(c); 899 } 900 } 901 return true; 902 } 903 904 // Return the number of capturing subpatterns, or -1 if the 905 // regexp wasn't valid on construction. 906 int RE2::NumberOfCapturingGroups() const { 907 if (suffix_regexp_ == NULL) 908 return -1; 909 ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case" 910 " multiple threads end up doing the same work in parallel."); 911 if (num_captures_ == -1) 912 num_captures_ = suffix_regexp_->NumCaptures(); 913 return num_captures_; 914 } 915 916 // Checks that the rewrite string is well-formed with respect to this 917 // regular expression. 918 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { 919 int max_token = -1; 920 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 921 s < end; s++) { 922 int c = *s; 923 if (c != '\\') { 924 continue; 925 } 926 if (++s == end) { 927 *error = "Rewrite schema error: '\\' not allowed at end."; 928 return false; 929 } 930 c = *s; 931 if (c == '\\') { 932 continue; 933 } 934 if (!isdigit(c)) { 935 *error = "Rewrite schema error: " 936 "'\\' must be followed by a digit or '\\'."; 937 return false; 938 } 939 int n = (c - '0'); 940 if (max_token < n) { 941 max_token = n; 942 } 943 } 944 945 if (max_token > NumberOfCapturingGroups()) { 946 SStringPrintf(error, "Rewrite schema requests %d matches, " 947 "but the regexp only has %d parenthesized subexpressions.", 948 max_token, NumberOfCapturingGroups()); 949 return false; 950 } 951 return true; 952 } 953 954 /***** Parsers for various types *****/ 955 956 bool RE2::Arg::parse_null(const char* str, int n, void* dest) { 957 // We fail if somebody asked us to store into a non-NULL void* pointer 958 return (dest == NULL); 959 } 960 961 bool RE2::Arg::parse_string(const char* str, int n, void* dest) { 962 if (dest == NULL) return true; 963 reinterpret_cast<string*>(dest)->assign(str, n); 964 return true; 965 } 966 967 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) { 968 if (dest == NULL) return true; 969 reinterpret_cast<StringPiece*>(dest)->set(str, n); 970 return true; 971 } 972 973 bool RE2::Arg::parse_char(const char* str, int n, void* dest) { 974 if (n != 1) return false; 975 if (dest == NULL) return true; 976 *(reinterpret_cast<char*>(dest)) = str[0]; 977 return true; 978 } 979 980 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { 981 if (n != 1) return false; 982 if (dest == NULL) return true; 983 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; 984 return true; 985 } 986 987 // Largest number spec that we are willing to parse 988 static const int kMaxNumberLength = 32; 989 990 // REQUIRES "buf" must have length at least kMaxNumberLength+1 991 // Copies "str" into "buf" and null-terminates. 992 // Overwrites *np with the new length. 993 static const char* TerminateNumber(char* buf, const char* str, int* np) { 994 int n = *np; 995 if (n <= 0) return ""; 996 if (n > 0 && isspace(*str)) { 997 // We are less forgiving than the strtoxxx() routines and do not 998 // allow leading spaces. 999 return ""; 1000 } 1001 1002 // Although buf has a fixed maximum size, we can still handle 1003 // arbitrarily large integers correctly by omitting leading zeros. 1004 // (Numbers that are still too long will be out of range.) 1005 // Before deciding whether str is too long, 1006 // remove leading zeros with s/000+/00/. 1007 // Leaving the leading two zeros in place means that 1008 // we don't change 0000x123 (invalid) into 0x123 (valid). 1009 // Skip over leading - before replacing. 1010 bool neg = false; 1011 if (n >= 1 && str[0] == '-') { 1012 neg = true; 1013 n--; 1014 str++; 1015 } 1016 1017 if (n >= 3 && str[0] == '0' && str[1] == '0') { 1018 while (n >= 3 && str[2] == '0') { 1019 n--; 1020 str++; 1021 } 1022 } 1023 1024 if (neg) { // make room in buf for - 1025 n++; 1026 str--; 1027 } 1028 1029 if (n > kMaxNumberLength) return ""; 1030 1031 memmove(buf, str, n); 1032 if (neg) { 1033 buf[0] = '-'; 1034 } 1035 buf[n] = '\0'; 1036 *np = n; 1037 return buf; 1038 } 1039 1040 bool RE2::Arg::parse_long_radix(const char* str, 1041 int n, 1042 void* dest, 1043 int radix) { 1044 if (n == 0) return false; 1045 char buf[kMaxNumberLength+1]; 1046 str = TerminateNumber(buf, str, &n); 1047 char* end; 1048 errno = 0; 1049 long r = strtol(str, &end, radix); 1050 if (end != str + n) return false; // Leftover junk 1051 if (errno) return false; 1052 if (dest == NULL) return true; 1053 *(reinterpret_cast<long*>(dest)) = r; 1054 return true; 1055 } 1056 1057 bool RE2::Arg::parse_ulong_radix(const char* str, 1058 int n, 1059 void* dest, 1060 int radix) { 1061 if (n == 0) return false; 1062 char buf[kMaxNumberLength+1]; 1063 str = TerminateNumber(buf, str, &n); 1064 if (str[0] == '-') { 1065 // strtoul() will silently accept negative numbers and parse 1066 // them. This module is more strict and treats them as errors. 1067 return false; 1068 } 1069 1070 char* end; 1071 errno = 0; 1072 unsigned long r = strtoul(str, &end, radix); 1073 if (end != str + n) return false; // Leftover junk 1074 if (errno) return false; 1075 if (dest == NULL) return true; 1076 *(reinterpret_cast<unsigned long*>(dest)) = r; 1077 return true; 1078 } 1079 1080 bool RE2::Arg::parse_short_radix(const char* str, 1081 int n, 1082 void* dest, 1083 int radix) { 1084 long r; 1085 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse 1086 if ((short)r != r) return false; // Out of range 1087 if (dest == NULL) return true; 1088 *(reinterpret_cast<short*>(dest)) = r; 1089 return true; 1090 } 1091 1092 bool RE2::Arg::parse_ushort_radix(const char* str, 1093 int n, 1094 void* dest, 1095 int radix) { 1096 unsigned long r; 1097 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse 1098 if ((ushort)r != r) return false; // Out of range 1099 if (dest == NULL) return true; 1100 *(reinterpret_cast<unsigned short*>(dest)) = r; 1101 return true; 1102 } 1103 1104 bool RE2::Arg::parse_int_radix(const char* str, 1105 int n, 1106 void* dest, 1107 int radix) { 1108 long r; 1109 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse 1110 if ((int)r != r) return false; // Out of range 1111 if (dest == NULL) return true; 1112 *(reinterpret_cast<int*>(dest)) = r; 1113 return true; 1114 } 1115 1116 bool RE2::Arg::parse_uint_radix(const char* str, 1117 int n, 1118 void* dest, 1119 int radix) { 1120 unsigned long r; 1121 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse 1122 if ((uint)r != r) return false; // Out of range 1123 if (dest == NULL) return true; 1124 *(reinterpret_cast<unsigned int*>(dest)) = r; 1125 return true; 1126 } 1127 1128 bool RE2::Arg::parse_longlong_radix(const char* str, 1129 int n, 1130 void* dest, 1131 int radix) { 1132 if (n == 0) return false; 1133 char buf[kMaxNumberLength+1]; 1134 str = TerminateNumber(buf, str, &n); 1135 char* end; 1136 errno = 0; 1137 int64 r = strtoll(str, &end, radix); 1138 if (end != str + n) return false; // Leftover junk 1139 if (errno) return false; 1140 if (dest == NULL) return true; 1141 *(reinterpret_cast<int64*>(dest)) = r; 1142 return true; 1143 } 1144 1145 bool RE2::Arg::parse_ulonglong_radix(const char* str, 1146 int n, 1147 void* dest, 1148 int radix) { 1149 if (n == 0) return false; 1150 char buf[kMaxNumberLength+1]; 1151 str = TerminateNumber(buf, str, &n); 1152 if (str[0] == '-') { 1153 // strtoull() will silently accept negative numbers and parse 1154 // them. This module is more strict and treats them as errors. 1155 return false; 1156 } 1157 char* end; 1158 errno = 0; 1159 uint64 r = strtoull(str, &end, radix); 1160 if (end != str + n) return false; // Leftover junk 1161 if (errno) return false; 1162 if (dest == NULL) return true; 1163 *(reinterpret_cast<uint64*>(dest)) = r; 1164 return true; 1165 } 1166 1167 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) { 1168 if (n == 0) return false; 1169 static const int kMaxLength = 200; 1170 char buf[kMaxLength]; 1171 if (n >= kMaxLength) return false; 1172 memcpy(buf, str, n); 1173 buf[n] = '\0'; 1174 errno = 0; 1175 char* end; 1176 double r; 1177 if (isfloat) { 1178 r = strtof(buf, &end); 1179 } else { 1180 r = strtod(buf, &end); 1181 } 1182 if (end != buf + n) return false; // Leftover junk 1183 if (errno) return false; 1184 if (dest == NULL) return true; 1185 if (isfloat) { 1186 *(reinterpret_cast<float*>(dest)) = r; 1187 } else { 1188 *(reinterpret_cast<double*>(dest)) = r; 1189 } 1190 return true; 1191 } 1192 1193 bool RE2::Arg::parse_double(const char* str, int n, void* dest) { 1194 return parse_double_float(str, n, false, dest); 1195 } 1196 1197 bool RE2::Arg::parse_float(const char* str, int n, void* dest) { 1198 return parse_double_float(str, n, true, dest); 1199 } 1200 1201 1202 #define DEFINE_INTEGER_PARSERS(name) \ 1203 bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \ 1204 return parse_##name##_radix(str, n, dest, 10); \ 1205 } \ 1206 bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ 1207 return parse_##name##_radix(str, n, dest, 16); \ 1208 } \ 1209 bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ 1210 return parse_##name##_radix(str, n, dest, 8); \ 1211 } \ 1212 bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ 1213 return parse_##name##_radix(str, n, dest, 0); \ 1214 } 1215 1216 DEFINE_INTEGER_PARSERS(short); 1217 DEFINE_INTEGER_PARSERS(ushort); 1218 DEFINE_INTEGER_PARSERS(int); 1219 DEFINE_INTEGER_PARSERS(uint); 1220 DEFINE_INTEGER_PARSERS(long); 1221 DEFINE_INTEGER_PARSERS(ulong); 1222 DEFINE_INTEGER_PARSERS(longlong); 1223 DEFINE_INTEGER_PARSERS(ulonglong); 1224 1225 #undef DEFINE_INTEGER_PARSERS 1226 1227 } // namespace re2 1228