Home | History | Annotate | Download | only in libregex
      1 /**
      2  * @file op_regex.cpp
      3  * This file contains implementation for a lightweight wrapper around
      4  * libc regex, providing regular expression match and replace facility.
      5  *
      6  * @remark Copyright 2003 OProfile authors
      7  * @remark Read the file COPYING
      8  * @remark Idea comes from TextFilt project <http://textfilt.sourceforge.net>
      9  *
     10  * @author Philippe Elie
     11  */
     12 
     13 #include <cerrno>
     14 
     15 #include <iostream>
     16 #include <fstream>
     17 
     18 #include "string_manip.h"
     19 
     20 #include "op_regex.h"
     21 
     22 using namespace std;
     23 
     24 namespace {
     25 
     26 string op_regerror(int err, regex_t const & regexp)
     27 {
     28 	size_t needed_size = regerror(err, &regexp, 0, 0);
     29 	char * buffer = new char[needed_size];
     30 	regerror(err, &regexp, buffer, needed_size);
     31 
     32 	return buffer;
     33 }
     34 
     35 
     36 void op_regcomp(regex_t & regexp, string const & pattern)
     37 {
     38 	int err = regcomp(&regexp, pattern.c_str(), REG_EXTENDED);
     39 	if (err) {
     40 		throw bad_regex("regcomp error: " + op_regerror(err, regexp)
     41 				+ " for pattern : " + pattern);
     42 	}
     43 }
     44 
     45 
     46 bool op_regexec(regex_t const & regex, string const & str, regmatch_t * match,
     47 	       size_t nmatch)
     48 {
     49 	return regexec(&regex, str.c_str(), nmatch, match, 0) != REG_NOMATCH;
     50 }
     51 
     52 
     53 void op_regfree(regex_t & regexp)
     54 {
     55 	regfree(&regexp);
     56 }
     57 
     58 
     59 // return the index number associated with a char seen in a "\x".
     60 // Allowed range are for x is [0-9a-z] return size_t(-1) if x is not in
     61 // these ranges.
     62 size_t subexpr_index(char ch)
     63 {
     64 	if (isdigit(ch))
     65 		return ch - '0';
     66 	if (ch >= 'a' && ch <= 'z')
     67 		return ch - 'a' + 10;
     68 	return size_t(-1);
     69 }
     70 
     71 }  // anonymous namespace
     72 
     73 
     74 bad_regex::bad_regex(string const & pattern)
     75 	: op_exception(pattern)
     76 {
     77 }
     78 
     79 
     80 regular_expression_replace::regular_expression_replace(size_t limit_,
     81 						       size_t limit_defs)
     82 	:
     83 	limit(limit_),
     84 	limit_defs_expansion(limit_defs)
     85 {
     86 }
     87 
     88 
     89 regular_expression_replace::~regular_expression_replace()
     90 {
     91 	for (size_t i = 0 ; i < regex_replace.size() ; ++i)
     92 		op_regfree(regex_replace[i].regexp);
     93 }
     94 
     95 
     96 void regular_expression_replace::add_definition(string const & name,
     97 						string const & definition)
     98 {
     99 	defs[name] = expand_string(definition);
    100 }
    101 
    102 
    103 void regular_expression_replace::add_pattern(string const & pattern,
    104 					     string const & replace)
    105 {
    106 	string expanded_pattern = expand_string(pattern);
    107 
    108 	regex_t regexp;
    109 	op_regcomp(regexp, expanded_pattern);
    110 	replace_t regex = { regexp, replace };
    111 	regex_replace.push_back(regex);
    112 }
    113 
    114 
    115 string regular_expression_replace::expand_string(string const & input)
    116 {
    117 	string last, expanded(input);
    118 	size_t i = 0;
    119 	for (i = 0 ; i < limit_defs_expansion ; ++i) {
    120 		last = expanded;
    121 		expanded = substitute_definition(last);
    122 		if (expanded == last)
    123 			break;
    124 	}
    125 
    126 	if (i == limit_defs_expansion)
    127 		throw bad_regex("too many substitution for: + input");
    128 
    129 	return last;
    130 }
    131 
    132 
    133 string regular_expression_replace::substitute_definition(string const & pattern)
    134 {
    135 	string result;
    136 	bool previous_is_escape = false;
    137 
    138 	for (size_t i = 0 ; i < pattern.length() ; ++i) {
    139 		if (pattern[i] == '$' && !previous_is_escape) {
    140 			size_t pos = pattern.find('{', i);
    141 			if (pos != i + 1) {
    142 				throw bad_regex("invalid $ in pattern: " + pattern);
    143 			}
    144 			size_t end = pattern.find('}', i);
    145 			if (end == string::npos) {
    146 				throw bad_regex("no matching '}' in pattern: " + pattern);
    147 			}
    148 			string def_name = pattern.substr(pos+1, (end-pos) - 1);
    149 			if (defs.find(def_name) == defs.end()) {
    150 				throw bad_regex("definition not found and used in pattern: ("
    151 						+ def_name + ") " + pattern);
    152 			}
    153 			result += defs[def_name];
    154 			i = end;
    155 		} else {
    156 			if (pattern[i] == '\\' && !previous_is_escape)
    157 				previous_is_escape = true;
    158 			else
    159 				previous_is_escape = false;
    160 			result += pattern[i];
    161 		}
    162 	}
    163 
    164 	return result;
    165 }
    166 
    167 
    168 // FIXME limit output string size ? (cause we can have exponential growing
    169 // of output string through a rule "a" = "aa")
    170 bool regular_expression_replace::execute(string & str) const
    171 {
    172 	bool changed = true;
    173 	for (size_t nr_iter = 0; changed && nr_iter < limit ; ++nr_iter) {
    174 		changed = false;
    175 		for (size_t i = 0 ; i < regex_replace.size() ; ++i) {
    176 			if (do_execute(str, regex_replace[i]))
    177 				changed = true;
    178 		}
    179 	}
    180 
    181 	// this don't return if the input string has been changed but if
    182 	// we reach the limit number of iteration.
    183 	return changed == false;
    184 }
    185 
    186 
    187 bool regular_expression_replace::do_execute(string & str,
    188                                             replace_t const & regexp) const
    189 {
    190 	bool changed = false;
    191 
    192 	regmatch_t match[max_match];
    193 	for (size_t iter = 0;
    194 	     op_regexec(regexp.regexp, str, match, max_match) && iter < limit;
    195 	     iter++) {
    196 		changed = true;
    197 		do_replace(str, regexp.replace, match);
    198 	}
    199 
    200 	return changed;
    201 }
    202 
    203 
    204 regmatch_t const &
    205 regular_expression_replace::get_match(regmatch_t const * match, char idx) const
    206 {
    207 	size_t sub_expr = subexpr_index(idx);
    208 	if (sub_expr == size_t(-1))
    209 		throw bad_regex("expect group index: " + idx);
    210 	if (sub_expr >= max_match)
    211 		throw bad_regex("illegal group index :" + idx);
    212 	return match[sub_expr];
    213 }
    214 
    215 void regular_expression_replace::do_replace
    216 (string & str, string const & replace, regmatch_t const * match) const
    217 {
    218 	string inserted;
    219 	for (size_t i = 0 ; i < replace.length() ; ++i) {
    220 		if (replace[i] == '\\') {
    221 			if (i == replace.length() - 1) {
    222 				throw bad_regex("illegal \\ trailer: " +
    223 				                replace);
    224 			}
    225 			++i;
    226 			if (replace[i] == '\\') {
    227 				inserted += '\\';
    228 			}  else {
    229 				regmatch_t const & matched = get_match(match,
    230 					replace[i]);
    231 				if (matched.rm_so == -1 &&
    232 				    matched.rm_eo == -1) {
    233 					// empty match: nothing todo
    234 				} else if (matched.rm_so == -1 ||
    235 					   matched.rm_eo == -1) {
    236 					throw bad_regex("illegal match: " +
    237 						replace);
    238 				} else {
    239 					inserted += str.substr(matched.rm_so,
    240 					    matched.rm_eo - matched.rm_so);
    241 				}
    242 			}
    243 		} else {
    244 			inserted += replace[i];
    245 		}
    246 	}
    247 
    248 	size_t first = match[0].rm_so;
    249 	size_t count = match[0].rm_eo - match[0].rm_so;
    250 
    251 	str.replace(first, count, inserted);
    252 }
    253 
    254 
    255 void setup_regex(regular_expression_replace & regex,
    256                  string const & filename)
    257 {
    258 	ifstream in(filename.c_str());
    259 	if (!in) {
    260 		throw op_runtime_error("Can't open file " + filename +
    261 				" for reading", errno);
    262 	}
    263 
    264 	regular_expression_replace var_name_rule;
    265 	var_name_rule.add_pattern("^\\$([_a-zA-Z][_a-zA-Z0-9]*)[ ]*=.*", "\\1");
    266 	regular_expression_replace var_value_rule;
    267 	var_value_rule.add_pattern(".*=[ ]*\"(.*)\"", "\\1");
    268 
    269 	regular_expression_replace left_rule;
    270 	left_rule.add_pattern("[ ]*\"(.*)\"[ ]*=.*", "\\1");
    271 	regular_expression_replace right_rule;
    272 	right_rule.add_pattern(".*=[ ]*\"(.*)\"", "\\1");
    273 
    274 	string line;
    275 	while (getline(in, line)) {
    276 		line = trim(line);
    277 		if (line.empty() || line[0] == '#')
    278 			continue;
    279 
    280 		string temp = line;
    281 		var_name_rule.execute(temp);
    282 		if (temp == line) {
    283 			string left = line;
    284 			left_rule.execute(left);
    285 			if (left == line) {
    286 				throw bad_regex("invalid input file: \"" + line + '"');
    287 			}
    288 
    289 			string right = line;
    290 			right_rule.execute(right);
    291 			if (right == line) {
    292 				throw bad_regex("invalid input file: \"" + line + '"');
    293 			}
    294 
    295 			regex.add_pattern(left, right);
    296 		} else {
    297 			// temp != line ==> var_name_rule succeed to substitute
    298 			// into temp the var_name present in line
    299 			string var_name = temp;
    300 			string var_value = line;
    301 			var_value_rule.execute(var_value);
    302 			if (var_value == line) {
    303 				throw bad_regex("invalid input file: \"" + line + '"');
    304 			}
    305 
    306 			regex.add_definition(var_name, var_value);
    307 		}
    308 	}
    309 }
    310