1 //===-- Regex.cpp - Regular Expression matcher implementation -------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a POSIX regular expression matcher. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/Regex.h" 15 #include "regex_impl.h" 16 #include "llvm/ADT/SmallVector.h" 17 #include "llvm/ADT/StringRef.h" 18 #include "llvm/ADT/Twine.h" 19 #include <string> 20 using namespace llvm; 21 22 Regex::Regex() : preg(nullptr), error(REG_BADPAT) {} 23 24 Regex::Regex(StringRef regex, unsigned Flags) { 25 unsigned flags = 0; 26 preg = new llvm_regex(); 27 preg->re_endp = regex.end(); 28 if (Flags & IgnoreCase) 29 flags |= REG_ICASE; 30 if (Flags & Newline) 31 flags |= REG_NEWLINE; 32 if (!(Flags & BasicRegex)) 33 flags |= REG_EXTENDED; 34 error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); 35 } 36 37 Regex::Regex(Regex &®ex) { 38 preg = regex.preg; 39 error = regex.error; 40 regex.preg = nullptr; 41 regex.error = REG_BADPAT; 42 } 43 44 Regex::~Regex() { 45 if (preg) { 46 llvm_regfree(preg); 47 delete preg; 48 } 49 } 50 51 bool Regex::isValid(std::string &Error) { 52 if (!error) 53 return true; 54 55 size_t len = llvm_regerror(error, preg, nullptr, 0); 56 57 Error.resize(len - 1); 58 llvm_regerror(error, preg, &Error[0], len); 59 return false; 60 } 61 62 /// getNumMatches - In a valid regex, return the number of parenthesized 63 /// matches it contains. 64 unsigned Regex::getNumMatches() const { 65 return preg->re_nsub; 66 } 67 68 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){ 69 if (error) 70 return false; 71 72 unsigned nmatch = Matches ? preg->re_nsub+1 : 0; 73 74 // pmatch needs to have at least one element. 75 SmallVector<llvm_regmatch_t, 8> pm; 76 pm.resize(nmatch > 0 ? nmatch : 1); 77 pm[0].rm_so = 0; 78 pm[0].rm_eo = String.size(); 79 80 int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); 81 82 if (rc == REG_NOMATCH) 83 return false; 84 if (rc != 0) { 85 // regexec can fail due to invalid pattern or running out of memory. 86 error = rc; 87 return false; 88 } 89 90 // There was a match. 91 92 if (Matches) { // match position requested 93 Matches->clear(); 94 95 for (unsigned i = 0; i != nmatch; ++i) { 96 if (pm[i].rm_so == -1) { 97 // this group didn't match 98 Matches->push_back(StringRef()); 99 continue; 100 } 101 assert(pm[i].rm_eo >= pm[i].rm_so); 102 Matches->push_back(StringRef(String.data()+pm[i].rm_so, 103 pm[i].rm_eo-pm[i].rm_so)); 104 } 105 } 106 107 return true; 108 } 109 110 std::string Regex::sub(StringRef Repl, StringRef String, 111 std::string *Error) { 112 SmallVector<StringRef, 8> Matches; 113 114 // Reset error, if given. 115 if (Error && !Error->empty()) *Error = ""; 116 117 // Return the input if there was no match. 118 if (!match(String, &Matches)) 119 return String; 120 121 // Otherwise splice in the replacement string, starting with the prefix before 122 // the match. 123 std::string Res(String.begin(), Matches[0].begin()); 124 125 // Then the replacement string, honoring possible substitutions. 126 while (!Repl.empty()) { 127 // Skip to the next escape. 128 std::pair<StringRef, StringRef> Split = Repl.split('\\'); 129 130 // Add the skipped substring. 131 Res += Split.first; 132 133 // Check for terminimation and trailing backslash. 134 if (Split.second.empty()) { 135 if (Repl.size() != Split.first.size() && 136 Error && Error->empty()) 137 *Error = "replacement string contained trailing backslash"; 138 break; 139 } 140 141 // Otherwise update the replacement string and interpret escapes. 142 Repl = Split.second; 143 144 // FIXME: We should have a StringExtras function for mapping C99 escapes. 145 switch (Repl[0]) { 146 // Treat all unrecognized characters as self-quoting. 147 default: 148 Res += Repl[0]; 149 Repl = Repl.substr(1); 150 break; 151 152 // Single character escapes. 153 case 't': 154 Res += '\t'; 155 Repl = Repl.substr(1); 156 break; 157 case 'n': 158 Res += '\n'; 159 Repl = Repl.substr(1); 160 break; 161 162 // Decimal escapes are backreferences. 163 case '0': case '1': case '2': case '3': case '4': 164 case '5': case '6': case '7': case '8': case '9': { 165 // Extract the backreference number. 166 StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); 167 Repl = Repl.substr(Ref.size()); 168 169 unsigned RefValue; 170 if (!Ref.getAsInteger(10, RefValue) && 171 RefValue < Matches.size()) 172 Res += Matches[RefValue]; 173 else if (Error && Error->empty()) 174 *Error = ("invalid backreference string '" + Twine(Ref) + "'").str(); 175 break; 176 } 177 } 178 } 179 180 // And finally the suffix. 181 Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); 182 183 return Res; 184 } 185 186 // These are the special characters matched in functions like "p_ere_exp". 187 static const char RegexMetachars[] = "()^$|*+?.[]\\{}"; 188 189 bool Regex::isLiteralERE(StringRef Str) { 190 // Check for regex metacharacters. This list was derived from our regex 191 // implementation in regcomp.c and double checked against the POSIX extended 192 // regular expression specification. 193 return Str.find_first_of(RegexMetachars) == StringRef::npos; 194 } 195 196 std::string Regex::escape(StringRef String) { 197 std::string RegexStr; 198 for (unsigned i = 0, e = String.size(); i != e; ++i) { 199 if (strchr(RegexMetachars, String[i])) 200 RegexStr += '\\'; 201 RegexStr += String[i]; 202 } 203 204 return RegexStr; 205 } 206