1 /** 2 * @file op_regex.cpp 3 * This file contains implementation for a lightweight wrapper around 4 * libc regex, providing regular expression match and replace facility. 5 * 6 * @remark Copyright 2003 OProfile authors 7 * @remark Read the file COPYING 8 * @remark Idea comes from TextFilt project <http://textfilt.sourceforge.net> 9 * 10 * @author Philippe Elie 11 */ 12 13 #include <cerrno> 14 15 #include <iostream> 16 #include <fstream> 17 18 #include "string_manip.h" 19 20 #include "op_regex.h" 21 22 using namespace std; 23 24 namespace { 25 26 string op_regerror(int err, regex_t const & regexp) 27 { 28 size_t needed_size = regerror(err, ®exp, 0, 0); 29 char * buffer = new char[needed_size]; 30 regerror(err, ®exp, buffer, needed_size); 31 32 return buffer; 33 } 34 35 36 void op_regcomp(regex_t & regexp, string const & pattern) 37 { 38 int err = regcomp(®exp, pattern.c_str(), REG_EXTENDED); 39 if (err) { 40 throw bad_regex("regcomp error: " + op_regerror(err, regexp) 41 + " for pattern : " + pattern); 42 } 43 } 44 45 46 bool op_regexec(regex_t const & regex, string const & str, regmatch_t * match, 47 size_t nmatch) 48 { 49 return regexec(®ex, str.c_str(), nmatch, match, 0) != REG_NOMATCH; 50 } 51 52 53 void op_regfree(regex_t & regexp) 54 { 55 regfree(®exp); 56 } 57 58 59 // return the index number associated with a char seen in a "\x". 60 // Allowed range are for x is [0-9a-z] return size_t(-1) if x is not in 61 // these ranges. 62 size_t subexpr_index(char ch) 63 { 64 if (isdigit(ch)) 65 return ch - '0'; 66 if (ch >= 'a' && ch <= 'z') 67 return ch - 'a' + 10; 68 return size_t(-1); 69 } 70 71 } // anonymous namespace 72 73 74 bad_regex::bad_regex(string const & pattern) 75 : op_exception(pattern) 76 { 77 } 78 79 80 regular_expression_replace::regular_expression_replace(size_t limit_, 81 size_t limit_defs) 82 : 83 limit(limit_), 84 limit_defs_expansion(limit_defs) 85 { 86 } 87 88 89 regular_expression_replace::~regular_expression_replace() 90 { 91 for (size_t i = 0 ; i < regex_replace.size() ; ++i) 92 op_regfree(regex_replace[i].regexp); 93 } 94 95 96 void regular_expression_replace::add_definition(string const & name, 97 string const & definition) 98 { 99 defs[name] = expand_string(definition); 100 } 101 102 103 void regular_expression_replace::add_pattern(string const & pattern, 104 string const & replace) 105 { 106 string expanded_pattern = expand_string(pattern); 107 108 regex_t regexp; 109 op_regcomp(regexp, expanded_pattern); 110 replace_t regex = { regexp, replace }; 111 regex_replace.push_back(regex); 112 } 113 114 115 string regular_expression_replace::expand_string(string const & input) 116 { 117 string last, expanded(input); 118 size_t i = 0; 119 for (i = 0 ; i < limit_defs_expansion ; ++i) { 120 last = expanded; 121 expanded = substitute_definition(last); 122 if (expanded == last) 123 break; 124 } 125 126 if (i == limit_defs_expansion) 127 throw bad_regex("too many substitution for: + input"); 128 129 return last; 130 } 131 132 133 string regular_expression_replace::substitute_definition(string const & pattern) 134 { 135 string result; 136 bool previous_is_escape = false; 137 138 for (size_t i = 0 ; i < pattern.length() ; ++i) { 139 if (pattern[i] == '$' && !previous_is_escape) { 140 size_t pos = pattern.find('{', i); 141 if (pos != i + 1) { 142 throw bad_regex("invalid $ in pattern: " + pattern); 143 } 144 size_t end = pattern.find('}', i); 145 if (end == string::npos) { 146 throw bad_regex("no matching '}' in pattern: " + pattern); 147 } 148 string def_name = pattern.substr(pos+1, (end-pos) - 1); 149 if (defs.find(def_name) == defs.end()) { 150 throw bad_regex("definition not found and used in pattern: (" 151 + def_name + ") " + pattern); 152 } 153 result += defs[def_name]; 154 i = end; 155 } else { 156 if (pattern[i] == '\\' && !previous_is_escape) 157 previous_is_escape = true; 158 else 159 previous_is_escape = false; 160 result += pattern[i]; 161 } 162 } 163 164 return result; 165 } 166 167 168 // FIXME limit output string size ? (cause we can have exponential growing 169 // of output string through a rule "a" = "aa") 170 bool regular_expression_replace::execute(string & str) const 171 { 172 bool changed = true; 173 for (size_t nr_iter = 0; changed && nr_iter < limit ; ++nr_iter) { 174 changed = false; 175 for (size_t i = 0 ; i < regex_replace.size() ; ++i) { 176 if (do_execute(str, regex_replace[i])) 177 changed = true; 178 } 179 } 180 181 // this don't return if the input string has been changed but if 182 // we reach the limit number of iteration. 183 return changed == false; 184 } 185 186 187 bool regular_expression_replace::do_execute(string & str, 188 replace_t const & regexp) const 189 { 190 bool changed = false; 191 192 regmatch_t match[max_match]; 193 for (size_t iter = 0; 194 op_regexec(regexp.regexp, str, match, max_match) && iter < limit; 195 iter++) { 196 changed = true; 197 do_replace(str, regexp.replace, match); 198 } 199 200 return changed; 201 } 202 203 204 regmatch_t const & 205 regular_expression_replace::get_match(regmatch_t const * match, char idx) const 206 { 207 size_t sub_expr = subexpr_index(idx); 208 if (sub_expr == size_t(-1)) 209 throw bad_regex("expect group index: " + idx); 210 if (sub_expr >= max_match) 211 throw bad_regex("illegal group index :" + idx); 212 return match[sub_expr]; 213 } 214 215 void regular_expression_replace::do_replace 216 (string & str, string const & replace, regmatch_t const * match) const 217 { 218 string inserted; 219 for (size_t i = 0 ; i < replace.length() ; ++i) { 220 if (replace[i] == '\\') { 221 if (i == replace.length() - 1) { 222 throw bad_regex("illegal \\ trailer: " + 223 replace); 224 } 225 ++i; 226 if (replace[i] == '\\') { 227 inserted += '\\'; 228 } else { 229 regmatch_t const & matched = get_match(match, 230 replace[i]); 231 if (matched.rm_so == -1 && 232 matched.rm_eo == -1) { 233 // empty match: nothing todo 234 } else if (matched.rm_so == -1 || 235 matched.rm_eo == -1) { 236 throw bad_regex("illegal match: " + 237 replace); 238 } else { 239 inserted += str.substr(matched.rm_so, 240 matched.rm_eo - matched.rm_so); 241 } 242 } 243 } else { 244 inserted += replace[i]; 245 } 246 } 247 248 size_t first = match[0].rm_so; 249 size_t count = match[0].rm_eo - match[0].rm_so; 250 251 str.replace(first, count, inserted); 252 } 253 254 255 void setup_regex(regular_expression_replace & regex, 256 string const & filename) 257 { 258 ifstream in(filename.c_str()); 259 if (!in) { 260 throw op_runtime_error("Can't open file " + filename + 261 " for reading", errno); 262 } 263 264 regular_expression_replace var_name_rule; 265 var_name_rule.add_pattern("^\\$([_a-zA-Z][_a-zA-Z0-9]*)[ ]*=.*", "\\1"); 266 regular_expression_replace var_value_rule; 267 var_value_rule.add_pattern(".*=[ ]*\"(.*)\"", "\\1"); 268 269 regular_expression_replace left_rule; 270 left_rule.add_pattern("[ ]*\"(.*)\"[ ]*=.*", "\\1"); 271 regular_expression_replace right_rule; 272 right_rule.add_pattern(".*=[ ]*\"(.*)\"", "\\1"); 273 274 string line; 275 while (getline(in, line)) { 276 line = trim(line); 277 if (line.empty() || line[0] == '#') 278 continue; 279 280 string temp = line; 281 var_name_rule.execute(temp); 282 if (temp == line) { 283 string left = line; 284 left_rule.execute(left); 285 if (left == line) { 286 throw bad_regex("invalid input file: \"" + line + '"'); 287 } 288 289 string right = line; 290 right_rule.execute(right); 291 if (right == line) { 292 throw bad_regex("invalid input file: \"" + line + '"'); 293 } 294 295 regex.add_pattern(left, right); 296 } else { 297 // temp != line ==> var_name_rule succeed to substitute 298 // into temp the var_name present in line 299 string var_name = temp; 300 string var_value = line; 301 var_value_rule.execute(var_value); 302 if (var_value == line) { 303 throw bad_regex("invalid input file: \"" + line + '"'); 304 } 305 306 regex.add_definition(var_name, var_value); 307 } 308 } 309 } 310