Home | History | Annotate | Download | only in bits
      1 // class template regex -*- C++ -*-
      2 
      3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
      4 //
      5 // This file is part of the GNU ISO C++ Library.  This library is free
      6 // software; you can redistribute it and/or modify it under the
      7 // terms of the GNU General Public License as published by the
      8 // Free Software Foundation; either version 3, or (at your option)
      9 // any later version.
     10 
     11 // This library is distributed in the hope that it will be useful,
     12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 // GNU General Public License for more details.
     15 
     16 // Under Section 7 of GPL version 3, you are granted additional
     17 // permissions described in the GCC Runtime Library Exception, version
     18 // 3.1, as published by the Free Software Foundation.
     19 
     20 // You should have received a copy of the GNU General Public License and
     21 // a copy of the GCC Runtime Library Exception along with this program;
     22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     23 // <http://www.gnu.org/licenses/>.
     24 
     25 /**
     26  *  @file bits/regex_executor.tcc
     27  *  This is an internal header file, included by other library headers.
     28  *  Do not attempt to use it directly. @headername{regex}
     29  */
     30 
     31 namespace std _GLIBCXX_VISIBILITY(default)
     32 {
     33 namespace __detail
     34 {
     35 _GLIBCXX_BEGIN_NAMESPACE_VERSION
     36 
     37   template<typename _BiIter, typename _Alloc, typename _TraitsT,
     38     bool __dfs_mode>
     39     bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
     40     _M_search()
     41     {
     42       if (_M_flags & regex_constants::match_continuous)
     43 	return _M_search_from_first();
     44       auto __cur = _M_begin;
     45       do
     46 	{
     47 	  _M_current = __cur;
     48 	  if (_M_main<false>())
     49 	    return true;
     50 	}
     51       // Continue when __cur == _M_end
     52       while (__cur++ != _M_end);
     53       return false;
     54     }
     55 
     56   // This function operates in different modes, DFS mode or BFS mode, indicated
     57   // by template parameter __dfs_mode. See _M_main for details.
     58   //
     59   // ------------------------------------------------------------
     60   //
     61   // DFS mode:
     62   //
     63   // It applies a Depth-First-Search (aka backtracking) on given NFA and input
     64   // string.
     65   // At the very beginning the executor stands in the start state, then it tries
     66   // every possible state transition in current state recursively. Some state
     67   // transitions consume input string, say, a single-char-matcher or a
     68   // back-reference matcher; some don't, like assertion or other anchor nodes.
     69   // When the input is exhausted and/or the current state is an accepting state,
     70   // the whole executor returns true.
     71   //
     72   // TODO: This approach is exponentially slow for certain input.
     73   //       Try to compile the NFA to a DFA.
     74   //
     75   // Time complexity: \Omega(match_length), O(2^(_M_nfa.size()))
     76   // Space complexity: \theta(match_results.size() + match_length)
     77   //
     78   // ------------------------------------------------------------
     79   //
     80   // BFS mode:
     81   //
     82   // Russ Cox's article (http://swtch.com/~rsc/regexp/regexp1.html)
     83   // explained this algorithm clearly.
     84   //
     85   // It first computes epsilon closure (states that can be achieved without
     86   // consuming characters) for every state that's still matching,
     87   // using the same DFS algorithm, but doesn't re-enter states (find a true in
     88   // _M_visited), nor follows _S_opcode_match.
     89   //
     90   // Then apply DFS using every _S_opcode_match (in _M_match_queue) as the start
     91   // state.
     92   //
     93   // It significantly reduces potential duplicate states, so has a better
     94   // upper bound; but it requires more overhead.
     95   //
     96   // Time complexity: \Omega(match_length * match_results.size())
     97   //                  O(match_length * _M_nfa.size() * match_results.size())
     98   // Space complexity: \Omega(_M_nfa.size() + match_results.size())
     99   //                   O(_M_nfa.size() * match_results.size())
    100   template<typename _BiIter, typename _Alloc, typename _TraitsT,
    101     bool __dfs_mode>
    102   template<bool __match_mode>
    103     bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
    104     _M_main()
    105     {
    106       if (__dfs_mode)
    107 	{
    108 	  _M_has_sol = false;
    109 	  _M_cur_results = _M_results;
    110 	  _M_dfs<__match_mode>(_M_start_state);
    111 	  return _M_has_sol;
    112 	}
    113       else
    114 	{
    115 	  _M_match_queue->push_back(make_pair(_M_start_state, _M_results));
    116 	  bool __ret = false;
    117 	  while (1)
    118 	    {
    119 	      _M_has_sol = false;
    120 	      if (_M_match_queue->empty())
    121 		break;
    122 	      _M_visited->assign(_M_visited->size(), false);
    123 	      auto __old_queue = std::move(*_M_match_queue);
    124 	      for (auto& __task : __old_queue)
    125 		{
    126 		  _M_cur_results = std::move(__task.second);
    127 		  _M_dfs<__match_mode>(__task.first);
    128 		}
    129 	      if (!__match_mode)
    130 		__ret |= _M_has_sol;
    131 	      if (_M_current == _M_end)
    132 		break;
    133 	      ++_M_current;
    134 	    }
    135 	  if (__match_mode)
    136 	    __ret = _M_has_sol;
    137 	  return __ret;
    138 	}
    139     }
    140 
    141   // Return whether now match the given sub-NFA.
    142   template<typename _BiIter, typename _Alloc, typename _TraitsT,
    143     bool __dfs_mode>
    144     bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
    145     _M_lookahead(_State<_TraitsT> __state)
    146     {
    147       _ResultsVec __what(_M_cur_results.size());
    148       auto __sub = std::unique_ptr<_Executor>(new _Executor(_M_current,
    149 							    _M_end,
    150 							    __what,
    151 							    _M_re,
    152 							    _M_flags));
    153       __sub->_M_start_state = __state._M_alt;
    154       if (__sub->_M_search_from_first())
    155 	{
    156 	  for (size_t __i = 0; __i < __what.size(); __i++)
    157 	    if (__what[__i].matched)
    158 	      _M_cur_results[__i] = __what[__i];
    159 	  return true;
    160 	}
    161       return false;
    162     }
    163 
    164   // TODO: Use a function vector to dispatch, instead of using switch-case.
    165   template<typename _BiIter, typename _Alloc, typename _TraitsT,
    166     bool __dfs_mode>
    167   template<bool __match_mode>
    168     void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
    169     _M_dfs(_StateIdT __i)
    170     {
    171       if (!__dfs_mode)
    172 	{
    173 	  if ((*_M_visited)[__i])
    174 	    return;
    175 	  (*_M_visited)[__i] = true;
    176 	}
    177 
    178       const auto& __state = _M_nfa[__i];
    179       // Every change on _M_cur_results and _M_current will be rolled back after
    180       // finishing the recursion step.
    181       switch (__state._M_opcode)
    182 	{
    183 	// _M_alt branch is "match once more", while _M_next is "get me out
    184 	// of this quantifier". Executing _M_next first or _M_alt first don't
    185 	// mean the same thing, and we need to choose the correct order under
    186 	// given greedy mode.
    187 	case _S_opcode_alternative:
    188 	  // Greedy.
    189 	  if (!__state._M_neg)
    190 	    {
    191 	      // "Once more" is preferred in greedy mode.
    192 	      _M_dfs<__match_mode>(__state._M_alt);
    193 	      // If it's DFS executor and already accepted, we're done.
    194 	      if (!__dfs_mode || !_M_has_sol)
    195 		_M_dfs<__match_mode>(__state._M_next);
    196 	    }
    197 	  else // Non-greedy mode
    198 	    {
    199 	      if (__dfs_mode)
    200 		{
    201 		  // vice-versa.
    202 		  _M_dfs<__match_mode>(__state._M_next);
    203 		  if (!_M_has_sol)
    204 		    _M_dfs<__match_mode>(__state._M_alt);
    205 		}
    206 	      else
    207 		{
    208 		  // DON'T attempt anything, because there's already another
    209 		  // state with higher priority accepted. This state cannot be
    210 		  // better by attempting its next node.
    211 		  if (!_M_has_sol)
    212 		    {
    213 		      _M_dfs<__match_mode>(__state._M_next);
    214 		      // DON'T attempt anything if it's already accepted. An
    215 		      // accepted state *must* be better than a solution that
    216 		      // matches a non-greedy quantifier one more time.
    217 		      if (!_M_has_sol)
    218 			_M_dfs<__match_mode>(__state._M_alt);
    219 		    }
    220 		}
    221 	    }
    222 	  break;
    223 	case _S_opcode_subexpr_begin:
    224 	  // If there's nothing changed since last visit, do NOT continue.
    225 	  // This prevents the executor from get into infinite loop when using
    226 	  // "()*" to match "".
    227 	  if (!_M_cur_results[__state._M_subexpr].matched
    228 	      || _M_cur_results[__state._M_subexpr].first != _M_current)
    229 	    {
    230 	      auto& __res = _M_cur_results[__state._M_subexpr];
    231 	      auto __back = __res.first;
    232 	      __res.first = _M_current;
    233 	      _M_dfs<__match_mode>(__state._M_next);
    234 	      __res.first = __back;
    235 	    }
    236 	  break;
    237 	case _S_opcode_subexpr_end:
    238 	  if (_M_cur_results[__state._M_subexpr].second != _M_current
    239 	      || _M_cur_results[__state._M_subexpr].matched != true)
    240 	    {
    241 	      auto& __res = _M_cur_results[__state._M_subexpr];
    242 	      auto __back = __res;
    243 	      __res.second = _M_current;
    244 	      __res.matched = true;
    245 	      _M_dfs<__match_mode>(__state._M_next);
    246 	      __res = __back;
    247 	    }
    248 	  else
    249 	    _M_dfs<__match_mode>(__state._M_next);
    250 	  break;
    251 	case _S_opcode_line_begin_assertion:
    252 	  if (_M_at_begin())
    253 	    _M_dfs<__match_mode>(__state._M_next);
    254 	  break;
    255 	case _S_opcode_line_end_assertion:
    256 	  if (_M_at_end())
    257 	    _M_dfs<__match_mode>(__state._M_next);
    258 	  break;
    259 	case _S_opcode_word_boundary:
    260 	  if (_M_word_boundary(__state) == !__state._M_neg)
    261 	    _M_dfs<__match_mode>(__state._M_next);
    262 	  break;
    263 	// Here __state._M_alt offers a single start node for a sub-NFA.
    264 	// We recursively invoke our algorithm to match the sub-NFA.
    265 	case _S_opcode_subexpr_lookahead:
    266 	  if (_M_lookahead(__state) == !__state._M_neg)
    267 	    _M_dfs<__match_mode>(__state._M_next);
    268 	  break;
    269 	case _S_opcode_match:
    270 	  if (__dfs_mode)
    271 	    {
    272 	      if (_M_current != _M_end && __state._M_matches(*_M_current))
    273 		{
    274 		  ++_M_current;
    275 		  _M_dfs<__match_mode>(__state._M_next);
    276 		  --_M_current;
    277 		}
    278 	    }
    279 	  else
    280 	    if (__state._M_matches(*_M_current))
    281 	      _M_match_queue->push_back(make_pair(__state._M_next,
    282 						  _M_cur_results));
    283 	  break;
    284 	// First fetch the matched result from _M_cur_results as __submatch;
    285 	// then compare it with
    286 	// (_M_current, _M_current + (__submatch.second - __submatch.first)).
    287 	// If matched, keep going; else just return and try another state.
    288 	case _S_opcode_backref:
    289 	  {
    290 	    _GLIBCXX_DEBUG_ASSERT(__dfs_mode);
    291 	    auto& __submatch = _M_cur_results[__state._M_backref_index];
    292 	    if (!__submatch.matched)
    293 	      break;
    294 	    auto __last = _M_current;
    295 	    for (auto __tmp = __submatch.first;
    296 		 __last != _M_end && __tmp != __submatch.second;
    297 		 ++__tmp)
    298 	      ++__last;
    299 	    if (_M_re._M_traits.transform(__submatch.first,
    300 						__submatch.second)
    301 		== _M_re._M_traits.transform(_M_current, __last))
    302 	      {
    303 		if (__last != _M_current)
    304 		  {
    305 		    auto __backup = _M_current;
    306 		    _M_current = __last;
    307 		    _M_dfs<__match_mode>(__state._M_next);
    308 		    _M_current = __backup;
    309 		  }
    310 		else
    311 		  _M_dfs<__match_mode>(__state._M_next);
    312 	      }
    313 	  }
    314 	  break;
    315 	case _S_opcode_accept:
    316 	  if (__dfs_mode)
    317 	    {
    318 	      _GLIBCXX_DEBUG_ASSERT(!_M_has_sol);
    319 	      if (__match_mode)
    320 		_M_has_sol = _M_current == _M_end;
    321 	      else
    322 		_M_has_sol = true;
    323 	      if (_M_current == _M_begin
    324 		  && (_M_flags & regex_constants::match_not_null))
    325 		_M_has_sol = false;
    326 	      if (_M_has_sol)
    327 		_M_results = _M_cur_results;
    328 	    }
    329 	  else
    330 	    {
    331 	      if (_M_current == _M_begin
    332 		  && (_M_flags & regex_constants::match_not_null))
    333 		break;
    334 	      if (!__match_mode || _M_current == _M_end)
    335 		if (!_M_has_sol)
    336 		  {
    337 		    _M_has_sol = true;
    338 		    _M_results = _M_cur_results;
    339 		  }
    340 	    }
    341 	  break;
    342 	default:
    343 	  _GLIBCXX_DEBUG_ASSERT(false);
    344 	}
    345     }
    346 
    347   // Return whether now is at some word boundary.
    348   template<typename _BiIter, typename _Alloc, typename _TraitsT,
    349     bool __dfs_mode>
    350     bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
    351     _M_word_boundary(_State<_TraitsT> __state) const
    352     {
    353       // By definition.
    354       bool __ans = false;
    355       auto __pre = _M_current;
    356       --__pre;
    357       if (!(_M_at_begin() && _M_at_end()))
    358 	{
    359 	  if (_M_at_begin())
    360 	    __ans = _M_is_word(*_M_current)
    361 	      && !(_M_flags & regex_constants::match_not_bow);
    362 	  else if (_M_at_end())
    363 	    __ans = _M_is_word(*__pre)
    364 	      && !(_M_flags & regex_constants::match_not_eow);
    365 	  else
    366 	    __ans = _M_is_word(*_M_current)
    367 	      != _M_is_word(*__pre);
    368 	}
    369       return __ans;
    370     }
    371 
    372 _GLIBCXX_END_NAMESPACE_VERSION
    373 } // namespace __detail
    374 } // namespace
    375