1 //===-- Regex.cpp - Regular Expression matcher implementation -------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a POSIX regular expression matcher. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/Regex.h" 15 #include "regex_impl.h" 16 #include "llvm/ADT/SmallVector.h" 17 #include "llvm/Support/ErrorHandling.h" 18 #include "llvm/Support/raw_ostream.h" 19 #include <string> 20 using namespace llvm; 21 22 Regex::Regex(StringRef regex, unsigned Flags) { 23 unsigned flags = 0; 24 preg = new llvm_regex(); 25 preg->re_endp = regex.end(); 26 if (Flags & IgnoreCase) 27 flags |= REG_ICASE; 28 if (Flags & Newline) 29 flags |= REG_NEWLINE; 30 if (!(Flags & BasicRegex)) 31 flags |= REG_EXTENDED; 32 error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); 33 } 34 35 Regex::~Regex() { 36 llvm_regfree(preg); 37 delete preg; 38 } 39 40 bool Regex::isValid(std::string &Error) { 41 if (!error) 42 return true; 43 44 size_t len = llvm_regerror(error, preg, NULL, 0); 45 46 Error.resize(len); 47 llvm_regerror(error, preg, &Error[0], len); 48 return false; 49 } 50 51 /// getNumMatches - In a valid regex, return the number of parenthesized 52 /// matches it contains. 53 unsigned Regex::getNumMatches() const { 54 return preg->re_nsub; 55 } 56 57 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){ 58 unsigned nmatch = Matches ? preg->re_nsub+1 : 0; 59 60 // pmatch needs to have at least one element. 61 SmallVector<llvm_regmatch_t, 8> pm; 62 pm.resize(nmatch > 0 ? nmatch : 1); 63 pm[0].rm_so = 0; 64 pm[0].rm_eo = String.size(); 65 66 int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); 67 68 if (rc == REG_NOMATCH) 69 return false; 70 if (rc != 0) { 71 // regexec can fail due to invalid pattern or running out of memory. 72 error = rc; 73 return false; 74 } 75 76 // There was a match. 77 78 if (Matches) { // match position requested 79 Matches->clear(); 80 81 for (unsigned i = 0; i != nmatch; ++i) { 82 if (pm[i].rm_so == -1) { 83 // this group didn't match 84 Matches->push_back(StringRef()); 85 continue; 86 } 87 assert(pm[i].rm_eo >= pm[i].rm_so); 88 Matches->push_back(StringRef(String.data()+pm[i].rm_so, 89 pm[i].rm_eo-pm[i].rm_so)); 90 } 91 } 92 93 return true; 94 } 95 96 std::string Regex::sub(StringRef Repl, StringRef String, 97 std::string *Error) { 98 SmallVector<StringRef, 8> Matches; 99 100 // Reset error, if given. 101 if (Error && !Error->empty()) *Error = ""; 102 103 // Return the input if there was no match. 104 if (!match(String, &Matches)) 105 return String; 106 107 // Otherwise splice in the replacement string, starting with the prefix before 108 // the match. 109 std::string Res(String.begin(), Matches[0].begin()); 110 111 // Then the replacement string, honoring possible substitutions. 112 while (!Repl.empty()) { 113 // Skip to the next escape. 114 std::pair<StringRef, StringRef> Split = Repl.split('\\'); 115 116 // Add the skipped substring. 117 Res += Split.first; 118 119 // Check for terminimation and trailing backslash. 120 if (Split.second.empty()) { 121 if (Repl.size() != Split.first.size() && 122 Error && Error->empty()) 123 *Error = "replacement string contained trailing backslash"; 124 break; 125 } 126 127 // Otherwise update the replacement string and interpret escapes. 128 Repl = Split.second; 129 130 // FIXME: We should have a StringExtras function for mapping C99 escapes. 131 switch (Repl[0]) { 132 // Treat all unrecognized characters as self-quoting. 133 default: 134 Res += Repl[0]; 135 Repl = Repl.substr(1); 136 break; 137 138 // Single character escapes. 139 case 't': 140 Res += '\t'; 141 Repl = Repl.substr(1); 142 break; 143 case 'n': 144 Res += '\n'; 145 Repl = Repl.substr(1); 146 break; 147 148 // Decimal escapes are backreferences. 149 case '0': case '1': case '2': case '3': case '4': 150 case '5': case '6': case '7': case '8': case '9': { 151 // Extract the backreference number. 152 StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); 153 Repl = Repl.substr(Ref.size()); 154 155 unsigned RefValue; 156 if (!Ref.getAsInteger(10, RefValue) && 157 RefValue < Matches.size()) 158 Res += Matches[RefValue]; 159 else if (Error && Error->empty()) 160 *Error = "invalid backreference string '" + Ref.str() + "'"; 161 break; 162 } 163 } 164 } 165 166 // And finally the suffix. 167 Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); 168 169 return Res; 170 } 171 172 bool Regex::isLiteralERE(StringRef Str) { 173 // Check for regex metacharacters. This list was derived from our regex 174 // implementation in regcomp.c and double checked against the POSIX extended 175 // regular expression specification. 176 return Str.find_first_of("()^$|*+?.[]\\{}") == StringRef::npos; 177 } 178