Home | History | Annotate | Download | only in bits
      1 // class template regex -*- C++ -*-
      2 
      3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
      4 //
      5 // This file is part of the GNU ISO C++ Library.  This library is free
      6 // software; you can redistribute it and/or modify it under the
      7 // terms of the GNU General Public License as published by the
      8 // Free Software Foundation; either version 3, or (at your option)
      9 // any later version.
     10 
     11 // This library is distributed in the hope that it will be useful,
     12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 // GNU General Public License for more details.
     15 
     16 // Under Section 7 of GPL version 3, you are granted additional
     17 // permissions described in the GCC Runtime Library Exception, version
     18 // 3.1, as published by the Free Software Foundation.
     19 
     20 // You should have received a copy of the GNU General Public License and
     21 // a copy of the GCC Runtime Library Exception along with this program;
     22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     23 // <http://www.gnu.org/licenses/>.
     24 
     25 /**
     26  *  @file bits/regex_compiler.tcc
     27  *  This is an internal header file, included by other library headers.
     28  *  Do not attempt to use it directly. @headername{regex}
     29  */
     30 
     31 // FIXME make comments doxygen format.
     32 
     33 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
     34 // (http://swtch.com/~rsc/regexp/regexp1.html"),
     35 // but doesn't strictly follow it.
     36 //
     37 // When compiling, states are *chained* instead of tree- or graph-constructed.
     38 // It's more like structured programs: there's if statement and loop statement.
     39 //
     40 // For alternative structure(say "a|b"), aka "if statement", two branchs should
     41 // be constructed. However, these two shall merge to an "end_tag" at the end of
     42 // this operator:
     43 //
     44 //                branch1
     45 //              /        \
     46 // => begin_tag            end_tag =>
     47 //              \        /
     48 //                branch2
     49 //
     50 // This is the difference between this implementation and that in Russ's
     51 // article.
     52 //
     53 // That's why we introduced dummy node here ------ "end_tag" is a dummy node.
     54 // All dummy node will be eliminated at the end of compiling process.
     55 
     56 namespace std _GLIBCXX_VISIBILITY(default)
     57 {
     58 namespace __detail
     59 {
     60 _GLIBCXX_BEGIN_NAMESPACE_VERSION
     61 
     62   template<typename _TraitsT>
     63     _Compiler<_TraitsT>::
     64     _Compiler(_IterT __b, _IterT __e,
     65 	      const _TraitsT& __traits, _FlagT __flags)
     66     : _M_flags((__flags
     67 		& (regex_constants::ECMAScript
     68 		   | regex_constants::basic
     69 		   | regex_constants::extended
     70 		   | regex_constants::grep
     71 		   | regex_constants::egrep
     72 		   | regex_constants::awk))
     73 	       ? __flags
     74 	       : __flags | regex_constants::ECMAScript),
     75     _M_traits(__traits),
     76     _M_ctype(std::use_facet<_CtypeT>(_M_traits.getloc())),
     77     _M_scanner(__b, __e, _M_flags, _M_traits.getloc()),
     78     _M_nfa(_M_flags)
     79     {
     80       _StateSeqT __r(_M_nfa, _M_nfa._M_start());
     81       __r._M_append(_M_nfa._M_insert_subexpr_begin());
     82       this->_M_disjunction();
     83       if (!_M_match_token(_ScannerT::_S_token_eof))
     84 	__throw_regex_error(regex_constants::error_paren);
     85       __r._M_append(_M_pop());
     86       _GLIBCXX_DEBUG_ASSERT(_M_stack.empty());
     87       __r._M_append(_M_nfa._M_insert_subexpr_end());
     88       __r._M_append(_M_nfa._M_insert_accept());
     89       _M_nfa._M_eliminate_dummy();
     90     }
     91 
     92   template<typename _TraitsT>
     93     void
     94     _Compiler<_TraitsT>::
     95     _M_disjunction()
     96     {
     97       this->_M_alternative();
     98       while (_M_match_token(_ScannerT::_S_token_or))
     99 	{
    100 	  _StateSeqT __alt1 = _M_pop();
    101 	  this->_M_alternative();
    102 	  _StateSeqT __alt2 = _M_pop();
    103 	  auto __end = _M_nfa._M_insert_dummy();
    104 	  __alt1._M_append(__end);
    105 	  __alt2._M_append(__end);
    106 	  _M_stack.push(_StateSeqT(_M_nfa,
    107 				   _M_nfa._M_insert_alt(__alt1._M_start,
    108 							__alt2._M_start, false),
    109 				   __end));
    110 	}
    111     }
    112 
    113   template<typename _TraitsT>
    114     void
    115     _Compiler<_TraitsT>::
    116     _M_alternative()
    117     {
    118       if (this->_M_term())
    119 	{
    120 	  _StateSeqT __re = _M_pop();
    121 	  this->_M_alternative();
    122 	  __re._M_append(_M_pop());
    123 	  _M_stack.push(__re);
    124 	}
    125       else
    126 	_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
    127     }
    128 
    129   template<typename _TraitsT>
    130     bool
    131     _Compiler<_TraitsT>::
    132     _M_term()
    133     {
    134       if (this->_M_assertion())
    135 	return true;
    136       if (this->_M_atom())
    137 	{
    138 	  while (this->_M_quantifier());
    139 	  return true;
    140 	}
    141       return false;
    142     }
    143 
    144   template<typename _TraitsT>
    145     bool
    146     _Compiler<_TraitsT>::
    147     _M_assertion()
    148     {
    149       if (_M_match_token(_ScannerT::_S_token_line_begin))
    150 	_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_begin()));
    151       else if (_M_match_token(_ScannerT::_S_token_line_end))
    152 	_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_end()));
    153       else if (_M_match_token(_ScannerT::_S_token_word_bound))
    154 	// _M_value[0] == 'n' means it's negtive, say "not word boundary".
    155 	_M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
    156 	      _M_insert_word_bound(_M_value[0] == 'n')));
    157       else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
    158 	{
    159 	  auto __neg = _M_value[0] == 'n';
    160 	  this->_M_disjunction();
    161 	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
    162 	    __throw_regex_error(regex_constants::error_paren);
    163 	  auto __tmp = _M_pop();
    164 	  __tmp._M_append(_M_nfa._M_insert_accept());
    165 	  _M_stack.push(
    166 	      _StateSeqT(
    167 		_M_nfa,
    168 		_M_nfa._M_insert_lookahead(__tmp._M_start, __neg)));
    169 	}
    170       else
    171 	return false;
    172       return true;
    173     }
    174 
    175   template<typename _TraitsT>
    176     bool
    177     _Compiler<_TraitsT>::
    178     _M_quantifier()
    179     {
    180       bool __neg = (_M_flags & regex_constants::ECMAScript);
    181       auto __init = [this, &__neg]()
    182 	{
    183 	  if (_M_stack.empty())
    184 	    __throw_regex_error(regex_constants::error_badrepeat);
    185 	  __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
    186 	};
    187       if (_M_match_token(_ScannerT::_S_token_closure0))
    188 	{
    189 	  __init();
    190 	  auto __e = _M_pop();
    191 	  _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
    192 						      __e._M_start, __neg));
    193 	  __e._M_append(__r);
    194 	  _M_stack.push(__r);
    195 	}
    196       else if (_M_match_token(_ScannerT::_S_token_closure1))
    197 	{
    198 	  __init();
    199 	  auto __e = _M_pop();
    200 	  __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start,
    201 					     __neg));
    202 	  _M_stack.push(__e);
    203 	}
    204       else if (_M_match_token(_ScannerT::_S_token_opt))
    205 	{
    206 	  __init();
    207 	  auto __e = _M_pop();
    208 	  auto __end = _M_nfa._M_insert_dummy();
    209 	  _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
    210 						      __e._M_start, __neg));
    211 	  __e._M_append(__end);
    212 	  __r._M_append(__end);
    213 	  _M_stack.push(__r);
    214 	}
    215       else if (_M_match_token(_ScannerT::_S_token_interval_begin))
    216 	{
    217 	  if (_M_stack.empty())
    218 	    __throw_regex_error(regex_constants::error_badrepeat);
    219 	  if (!_M_match_token(_ScannerT::_S_token_dup_count))
    220 	    __throw_regex_error(regex_constants::error_badbrace);
    221 	  _StateSeqT __r(_M_pop());
    222 	  _StateSeqT __e(_M_nfa, _M_nfa._M_insert_dummy());
    223 	  long __min_rep = _M_cur_int_value(10);
    224 	  bool __infi = false;
    225 	  long __n;
    226 
    227 	  // {3
    228 	  if (_M_match_token(_ScannerT::_S_token_comma))
    229 	    if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
    230 	      __n = _M_cur_int_value(10) - __min_rep;
    231 	    else
    232 	      __infi = true;
    233 	  else
    234 	    __n = 0;
    235 	  if (!_M_match_token(_ScannerT::_S_token_interval_end))
    236 	    __throw_regex_error(regex_constants::error_brace);
    237 
    238 	  __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
    239 
    240 	  for (long __i = 0; __i < __min_rep; ++__i)
    241 	    __e._M_append(__r._M_clone());
    242 
    243 	  if (__infi)
    244 	    {
    245 	      auto __tmp = __r._M_clone();
    246 	      _StateSeqT __s(_M_nfa,
    247 			     _M_nfa._M_insert_alt(_S_invalid_state_id,
    248 						  __tmp._M_start, __neg));
    249 	      __tmp._M_append(__s);
    250 	      __e._M_append(__s);
    251 	    }
    252 	  else
    253 	    {
    254 	      if (__n < 0)
    255 		__throw_regex_error(regex_constants::error_badbrace);
    256 	      auto __end = _M_nfa._M_insert_dummy();
    257 	      // _M_alt is the "match more" branch, and _M_next is the
    258 	      // "match less" one. Switch _M_alt and _M_next of all created
    259 	      // nodes. This is a hacking but IMO works well.
    260 	      std::stack<_StateIdT> __stack;
    261 	      for (long __i = 0; __i < __n; ++__i)
    262 		{
    263 		  auto __tmp = __r._M_clone();
    264 		  auto __alt = _M_nfa._M_insert_alt(__tmp._M_start,
    265 						    __end, __neg);
    266 		  __stack.push(__alt);
    267 		  __e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end));
    268 		}
    269 	      __e._M_append(__end);
    270 	      while (!__stack.empty())
    271 		{
    272 		  auto& __tmp = _M_nfa[__stack.top()];
    273 		  __stack.pop();
    274 		  swap(__tmp._M_next, __tmp._M_alt);
    275 		}
    276 	    }
    277 	  _M_stack.push(__e);
    278 	}
    279       else
    280 	return false;
    281       return true;
    282     }
    283 
    284 #define __INSERT_REGEX_MATCHER(__func, args...)\
    285 	do\
    286 	  if (!(_M_flags & regex_constants::icase))\
    287 	    if (!(_M_flags & regex_constants::collate))\
    288 	      __func<false, false>(args);\
    289 	    else\
    290 	      __func<false, true>(args);\
    291 	  else\
    292 	    if (!(_M_flags & regex_constants::collate))\
    293 	      __func<true, false>(args);\
    294 	    else\
    295 	      __func<true, true>(args);\
    296 	while (false)
    297 
    298   template<typename _TraitsT>
    299     bool
    300     _Compiler<_TraitsT>::
    301     _M_atom()
    302     {
    303       if (_M_match_token(_ScannerT::_S_token_anychar))
    304 	{
    305 	  if (!(_M_flags & regex_constants::ECMAScript))
    306 	    __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
    307 	  else
    308 	    __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
    309 	}
    310       else if (_M_try_char())
    311 	__INSERT_REGEX_MATCHER(_M_insert_char_matcher);
    312       else if (_M_match_token(_ScannerT::_S_token_backref))
    313 	_M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
    314 				 _M_insert_backref(_M_cur_int_value(10))));
    315       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
    316 	__INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
    317       else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
    318 	{
    319 	  _StateSeqT __r(_M_nfa, _M_nfa._M_insert_dummy());
    320 	  this->_M_disjunction();
    321 	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
    322 	    __throw_regex_error(regex_constants::error_paren);
    323 	  __r._M_append(_M_pop());
    324 	  _M_stack.push(__r);
    325 	}
    326       else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
    327 	{
    328 	  _StateSeqT __r(_M_nfa, _M_nfa._M_insert_subexpr_begin());
    329 	  this->_M_disjunction();
    330 	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
    331 	    __throw_regex_error(regex_constants::error_paren);
    332 	  __r._M_append(_M_pop());
    333 	  __r._M_append(_M_nfa._M_insert_subexpr_end());
    334 	  _M_stack.push(__r);
    335 	}
    336       else if (!_M_bracket_expression())
    337 	return false;
    338       return true;
    339     }
    340 
    341   template<typename _TraitsT>
    342     bool
    343     _Compiler<_TraitsT>::
    344     _M_bracket_expression()
    345     {
    346       bool __neg =
    347 	_M_match_token(_ScannerT::_S_token_bracket_neg_begin);
    348       if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
    349 	return false;
    350       __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
    351       return true;
    352     }
    353 #undef __INSERT_REGEX_MATCHER
    354 
    355   template<typename _TraitsT>
    356   template<bool __icase, bool __collate>
    357     void
    358     _Compiler<_TraitsT>::
    359     _M_insert_any_matcher_ecma()
    360     {
    361       _M_stack.push(_StateSeqT(_M_nfa,
    362 	_M_nfa._M_insert_matcher
    363 	  (_AnyMatcher<_TraitsT, true, __icase, __collate>
    364 	    (_M_traits))));
    365     }
    366 
    367   template<typename _TraitsT>
    368   template<bool __icase, bool __collate>
    369     void
    370     _Compiler<_TraitsT>::
    371     _M_insert_any_matcher_posix()
    372     {
    373       _M_stack.push(_StateSeqT(_M_nfa,
    374 	_M_nfa._M_insert_matcher
    375 	  (_AnyMatcher<_TraitsT, false, __icase, __collate>
    376 	    (_M_traits))));
    377     }
    378 
    379   template<typename _TraitsT>
    380   template<bool __icase, bool __collate>
    381     void
    382     _Compiler<_TraitsT>::
    383     _M_insert_char_matcher()
    384     {
    385       _M_stack.push(_StateSeqT(_M_nfa,
    386 	_M_nfa._M_insert_matcher
    387 	  (_CharMatcher<_TraitsT, __icase, __collate>
    388 	    (_M_value[0], _M_traits))));
    389     }
    390 
    391   template<typename _TraitsT>
    392   template<bool __icase, bool __collate>
    393     void
    394     _Compiler<_TraitsT>::
    395     _M_insert_character_class_matcher()
    396     {
    397       _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1);
    398       _BracketMatcher<_TraitsT, __icase, __collate> __matcher
    399 	(_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
    400       __matcher._M_add_character_class(_M_value, false);
    401       __matcher._M_ready();
    402       _M_stack.push(_StateSeqT(_M_nfa,
    403 	_M_nfa._M_insert_matcher(std::move(__matcher))));
    404     }
    405 
    406   template<typename _TraitsT>
    407   template<bool __icase, bool __collate>
    408     void
    409     _Compiler<_TraitsT>::
    410     _M_insert_bracket_matcher(bool __neg)
    411     {
    412       _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
    413       while (!_M_match_token(_ScannerT::_S_token_bracket_end))
    414 	_M_expression_term(__matcher);
    415       __matcher._M_ready();
    416       _M_stack.push(_StateSeqT(_M_nfa,
    417 			       _M_nfa._M_insert_matcher(std::move(__matcher))));
    418     }
    419 
    420   template<typename _TraitsT>
    421   template<bool __icase, bool __collate>
    422     void
    423     _Compiler<_TraitsT>::
    424     _M_expression_term(_BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
    425     {
    426       if (_M_match_token(_ScannerT::_S_token_collsymbol))
    427 	__matcher._M_add_collating_element(_M_value);
    428       else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
    429 	__matcher._M_add_equivalence_class(_M_value);
    430       else if (_M_match_token(_ScannerT::_S_token_char_class_name))
    431 	__matcher._M_add_character_class(_M_value, false);
    432       else if (_M_try_char()) // [a
    433 	{
    434 	  auto __ch = _M_value[0];
    435 	  if (_M_try_char())
    436 	    {
    437 	      if (_M_value[0] == '-') // [a-
    438 		{
    439 		  if (_M_try_char()) // [a-z]
    440 		    {
    441 		      __matcher._M_make_range(__ch, _M_value[0]);
    442 		      return;
    443 		    }
    444 		  // If the dash is the last character in the bracket
    445 		  // expression, it is not special.
    446 		  if (_M_scanner._M_get_token()
    447 		      != _ScannerT::_S_token_bracket_end)
    448 		    __throw_regex_error(regex_constants::error_range);
    449 		}
    450 	      __matcher._M_add_char(_M_value[0]);
    451 	    }
    452 	  __matcher._M_add_char(__ch);
    453 	}
    454       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
    455 	__matcher._M_add_character_class(_M_value,
    456 					 _M_ctype.is(_CtypeT::upper,
    457 						     _M_value[0]));
    458       else
    459 	__throw_regex_error(regex_constants::error_brack);
    460     }
    461 
    462   template<typename _TraitsT>
    463     bool
    464     _Compiler<_TraitsT>::
    465     _M_try_char()
    466     {
    467       bool __is_char = false;
    468       if (_M_match_token(_ScannerT::_S_token_oct_num))
    469 	{
    470 	  __is_char = true;
    471 	  _M_value.assign(1, _M_cur_int_value(8));
    472 	}
    473       else if (_M_match_token(_ScannerT::_S_token_hex_num))
    474 	{
    475 	  __is_char = true;
    476 	  _M_value.assign(1, _M_cur_int_value(16));
    477 	}
    478       else if (_M_match_token(_ScannerT::_S_token_ord_char))
    479 	__is_char = true;
    480       return __is_char;
    481     }
    482 
    483   template<typename _TraitsT>
    484     bool
    485     _Compiler<_TraitsT>::
    486     _M_match_token(_TokenT token)
    487     {
    488       if (token == _M_scanner._M_get_token())
    489 	{
    490 	  _M_value = _M_scanner._M_get_value();
    491 	  _M_scanner._M_advance();
    492 	  return true;
    493 	}
    494       return false;
    495     }
    496 
    497   template<typename _TraitsT>
    498     int
    499     _Compiler<_TraitsT>::
    500     _M_cur_int_value(int __radix)
    501     {
    502       long __v = 0;
    503       for (typename _StringT::size_type __i = 0;
    504 	   __i < _M_value.length(); ++__i)
    505 	__v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
    506       return __v;
    507     }
    508 
    509   template<typename _TraitsT, bool __icase, bool __collate>
    510     bool
    511     _BracketMatcher<_TraitsT, __icase, __collate>::
    512     _M_apply(_CharT __ch, false_type) const
    513     {
    514       bool __ret = false;
    515       if (std::find(_M_char_set.begin(), _M_char_set.end(),
    516 		    _M_translator._M_translate(__ch))
    517 	  != _M_char_set.end())
    518 	__ret = true;
    519       else
    520 	{
    521 	  auto __s = _M_translator._M_transform(__ch);
    522 	  for (auto& __it : _M_range_set)
    523 	    if (__it.first <= __s && __s <= __it.second)
    524 	      {
    525 		__ret = true;
    526 		break;
    527 	      }
    528 	  if (_M_traits.isctype(__ch, _M_class_set))
    529 	    __ret = true;
    530 	  else if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
    531 			     _M_traits.transform_primary(&__ch, &__ch+1))
    532 		   != _M_equiv_set.end())
    533 	    __ret = true;
    534 	  else
    535 	    {
    536 	      for (auto& __it : _M_neg_class_set)
    537 		if (!_M_traits.isctype(__ch, __it))
    538 		  {
    539 		    __ret = true;
    540 		    break;
    541 		  }
    542 	    }
    543 	}
    544       if (_M_is_non_matching)
    545 	return !__ret;
    546       else
    547 	return __ret;
    548     }
    549 
    550 _GLIBCXX_END_NAMESPACE_VERSION
    551 } // namespace __detail
    552 } // namespace
    553