Home | History | Annotate | Download | only in bits
      1 // class template regex -*- C++ -*-
      2 
      3 // Copyright (C) 2010, 2011 Free Software Foundation, Inc.
      4 //
      5 // This file is part of the GNU ISO C++ Library.  This library is free
      6 // software; you can redistribute it and/or modify it under the
      7 // terms of the GNU General Public License as published by the
      8 // Free Software Foundation; either version 3, or (at your option)
      9 // any later version.
     10 
     11 // This library is distributed in the hope that it will be useful,
     12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 // GNU General Public License for more details.
     15 
     16 // Under Section 7 of GPL version 3, you are granted additional
     17 // permissions described in the GCC Runtime Library Exception, version
     18 // 3.1, as published by the Free Software Foundation.
     19 
     20 // You should have received a copy of the GNU General Public License and
     21 // a copy of the GCC Runtime Library Exception along with this program;
     22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     23 // <http://www.gnu.org/licenses/>.
     24 
     25 /**
     26  *  @file bits/regex_compiler.h
     27  *  This is an internal header file, included by other library headers.
     28  *  Do not attempt to use it directly. @headername{regex}
     29  */
     30 
     31 namespace std _GLIBCXX_VISIBILITY(default)
     32 {
     33 namespace __regex
     34 {
     35 _GLIBCXX_BEGIN_NAMESPACE_VERSION
     36 
     37   struct _Scanner_base
     38   {
     39     typedef unsigned int _StateT;
     40 
     41     static constexpr _StateT _S_state_at_start    = 1 << 0;
     42     static constexpr _StateT _S_state_in_brace    = 1 << 2;
     43     static constexpr _StateT _S_state_in_bracket  = 1 << 3;
     44 
     45     virtual ~_Scanner_base() { };
     46   };
     47 
     48   //
     49   // @brief Scans an input range for regex tokens.
     50   //
     51   // The %_Scanner class interprets the regular expression pattern in the input
     52   // range passed to its constructor as a sequence of parse tokens passed to
     53   // the regular expression compiler.  The sequence of tokens provided depends
     54   // on the flag settings passed to the constructor:  different regular
     55   // expression grammars will interpret the same input pattern in
     56   // syntactically different ways.
     57   //
     58   template<typename _InputIterator>
     59     class _Scanner: public _Scanner_base
     60     {
     61     public:
     62       typedef _InputIterator                                        _IteratorT;
     63       typedef typename std::iterator_traits<_IteratorT>::value_type _CharT;
     64       typedef std::basic_string<_CharT>                             _StringT;
     65       typedef regex_constants::syntax_option_type                   _FlagT;
     66       typedef const std::ctype<_CharT>                              _CtypeT;
     67 
     68       // Token types returned from the scanner.
     69       enum _TokenT
     70       {
     71 	_S_token_anychar,
     72 	_S_token_backref,
     73 	_S_token_bracket_begin,
     74 	_S_token_bracket_end,
     75 	_S_token_inverse_class,
     76 	_S_token_char_class_name,
     77 	_S_token_closure0,
     78 	_S_token_closure1,
     79 	_S_token_collelem_multi,
     80 	_S_token_collelem_single,
     81 	_S_token_collsymbol,
     82 	_S_token_comma,
     83 	_S_token_dash,
     84 	_S_token_dup_count,
     85 	_S_token_eof,
     86 	_S_token_equiv_class_name,
     87 	_S_token_interval_begin,
     88 	_S_token_interval_end,
     89 	_S_token_line_begin,
     90 	_S_token_line_end,
     91 	_S_token_opt,
     92 	_S_token_or,
     93 	_S_token_ord_char,
     94 	_S_token_quoted_char,
     95 	_S_token_subexpr_begin,
     96 	_S_token_subexpr_end,
     97 	_S_token_word_begin,
     98 	_S_token_word_end,
     99 	_S_token_unknown
    100       };
    101 
    102     public:
    103       _Scanner(_IteratorT __begin, _IteratorT __end, _FlagT __flags,
    104 	       std::locale __loc)
    105       : _M_current(__begin) , _M_end(__end) , _M_flags(__flags),
    106         _M_ctype(std::use_facet<_CtypeT>(__loc)), _M_state(_S_state_at_start)
    107       { _M_advance(); }
    108 
    109       void
    110       _M_advance();
    111 
    112       _TokenT
    113       _M_token() const
    114       { return _M_curToken; }
    115 
    116       const _StringT&
    117       _M_value() const
    118       { return _M_curValue; }
    119 
    120 #ifdef _GLIBCXX_DEBUG
    121       std::ostream&
    122       _M_print(std::ostream&);
    123 #endif
    124 
    125     private:
    126       void
    127       _M_eat_escape();
    128 
    129       void
    130       _M_scan_in_brace();
    131 
    132       void
    133       _M_scan_in_bracket();
    134 
    135       void
    136       _M_eat_charclass();
    137 
    138       void
    139       _M_eat_equivclass();
    140 
    141       void
    142       _M_eat_collsymbol();
    143 
    144     private:
    145       _IteratorT  _M_current;
    146       _IteratorT  _M_end;
    147       _FlagT      _M_flags;
    148       _CtypeT&    _M_ctype;
    149       _TokenT     _M_curToken;
    150       _StringT    _M_curValue;
    151       _StateT     _M_state;
    152     };
    153 
    154   template<typename _InputIterator>
    155     void
    156     _Scanner<_InputIterator>::
    157     _M_advance()
    158     {
    159       if (_M_current == _M_end)
    160 	{
    161 	  _M_curToken = _S_token_eof;
    162 	  return;
    163 	}
    164 
    165       _CharT __c = *_M_current;
    166       if (_M_state & _S_state_in_bracket)
    167 	{
    168 	  _M_scan_in_bracket();
    169 	  return;
    170 	}
    171       if (_M_state & _S_state_in_brace)
    172 	{
    173 	  _M_scan_in_brace();
    174 	  return;
    175 	}
    176 #if 0
    177       // TODO: re-enable line anchors when _M_assertion is implemented.
    178       // See PR libstdc++/47724
    179       else if (_M_state & _S_state_at_start && __c == _M_ctype.widen('^'))
    180 	{
    181 	  _M_curToken = _S_token_line_begin;
    182 	  ++_M_current;
    183 	  return;
    184 	}
    185       else if (__c == _M_ctype.widen('$'))
    186 	{
    187 	  _M_curToken = _S_token_line_end;
    188 	  ++_M_current;
    189 	  return;
    190 	}
    191 #endif
    192       else if (__c == _M_ctype.widen('.'))
    193 	{
    194 	  _M_curToken = _S_token_anychar;
    195 	  ++_M_current;
    196 	  return;
    197 	}
    198       else if (__c == _M_ctype.widen('*'))
    199 	{
    200 	  _M_curToken = _S_token_closure0;
    201 	  ++_M_current;
    202 	  return;
    203 	}
    204       else if (__c == _M_ctype.widen('+'))
    205 	{
    206 	  _M_curToken = _S_token_closure1;
    207 	  ++_M_current;
    208 	  return;
    209 	}
    210       else if (__c == _M_ctype.widen('|'))
    211 	{
    212 	  _M_curToken = _S_token_or;
    213 	  ++_M_current;
    214 	  return;
    215 	}
    216       else if (__c == _M_ctype.widen('['))
    217 	{
    218 	  _M_curToken = _S_token_bracket_begin;
    219 	  _M_state |= (_S_state_in_bracket | _S_state_at_start);
    220 	  ++_M_current;
    221 	  return;
    222 	}
    223       else if (__c == _M_ctype.widen('\\'))
    224 	{
    225 	  _M_eat_escape();
    226 	  return;
    227 	}
    228       else if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
    229 	{
    230 	  if (__c == _M_ctype.widen('('))
    231 	    {
    232 	      _M_curToken = _S_token_subexpr_begin;
    233 	      ++_M_current;
    234 	      return;
    235 	    }
    236 	  else if (__c == _M_ctype.widen(')'))
    237 	    {
    238 	      _M_curToken = _S_token_subexpr_end;
    239 	      ++_M_current;
    240 	      return;
    241 	    }
    242 	  else if (__c == _M_ctype.widen('{'))
    243 	    {
    244 	      _M_curToken = _S_token_interval_begin;
    245 	      _M_state |= _S_state_in_brace;
    246 	      ++_M_current;
    247 	      return;
    248 	    }
    249 	}
    250 
    251       _M_curToken = _S_token_ord_char;
    252       _M_curValue.assign(1, __c);
    253       ++_M_current;
    254     }
    255 
    256 
    257   template<typename _InputIterator>
    258     void
    259     _Scanner<_InputIterator>::
    260     _M_scan_in_brace()
    261     {
    262       if (_M_ctype.is(_CtypeT::digit, *_M_current))
    263 	{
    264 	  _M_curToken = _S_token_dup_count;
    265 	  _M_curValue.assign(1, *_M_current);
    266 	  ++_M_current;
    267 	  while (_M_current != _M_end
    268 		 && _M_ctype.is(_CtypeT::digit, *_M_current))
    269 	    {
    270 	      _M_curValue += *_M_current;
    271 	      ++_M_current;
    272 	    }
    273 	  return;
    274 	}
    275       else if (*_M_current == _M_ctype.widen(','))
    276 	{
    277 	  _M_curToken = _S_token_comma;
    278 	  ++_M_current;
    279 	  return;
    280 	}
    281       if (_M_flags & (regex_constants::basic | regex_constants::grep))
    282 	{
    283 	  if (*_M_current == _M_ctype.widen('\\'))
    284 	    _M_eat_escape();
    285 	}
    286       else
    287 	{
    288 	  if (*_M_current == _M_ctype.widen('}'))
    289 	    {
    290 	      _M_curToken = _S_token_interval_end;
    291 	      _M_state &= ~_S_state_in_brace;
    292 	      ++_M_current;
    293 	      return;
    294 	    }
    295 	}
    296     }
    297 
    298   template<typename _InputIterator>
    299     void
    300     _Scanner<_InputIterator>::
    301     _M_scan_in_bracket()
    302     {
    303       if (_M_state & _S_state_at_start && *_M_current == _M_ctype.widen('^'))
    304 	{
    305 	  _M_curToken = _S_token_inverse_class;
    306 	  _M_state &= ~_S_state_at_start;
    307 	  ++_M_current;
    308 	  return;
    309 	}
    310       else if (*_M_current == _M_ctype.widen('['))
    311 	{
    312 	  ++_M_current;
    313 	  if (_M_current == _M_end)
    314 	    {
    315 	      _M_curToken = _S_token_eof;
    316 	      return;
    317 	    }
    318 
    319 	  if (*_M_current == _M_ctype.widen('.'))
    320 	    {
    321 	      _M_curToken = _S_token_collsymbol;
    322 	      _M_eat_collsymbol();
    323 	      return;
    324 	    }
    325 	  else if (*_M_current == _M_ctype.widen(':'))
    326 	    {
    327 	      _M_curToken = _S_token_char_class_name;
    328 	      _M_eat_charclass();
    329 	      return;
    330 	    }
    331 	  else if (*_M_current == _M_ctype.widen('='))
    332 	    {
    333 	      _M_curToken = _S_token_equiv_class_name;
    334 	      _M_eat_equivclass();
    335 	      return;
    336 	    }
    337 	}
    338       else if (*_M_current == _M_ctype.widen('-'))
    339 	{
    340 	  _M_curToken = _S_token_dash;
    341 	  ++_M_current;
    342 	  return;
    343 	}
    344       else if (*_M_current == _M_ctype.widen(']'))
    345 	{
    346 	  if (!(_M_flags & regex_constants::ECMAScript)
    347 	      || !(_M_state & _S_state_at_start))
    348 	    {
    349 	      // special case: only if  _not_ chr first after
    350 	      // '[' or '[^' and if not ECMAscript
    351 	      _M_curToken = _S_token_bracket_end;
    352 	      ++_M_current;
    353 	      return;
    354 	    }
    355 	}
    356       _M_curToken = _S_token_collelem_single;
    357       _M_curValue.assign(1, *_M_current);
    358       ++_M_current;
    359     }
    360 
    361   template<typename _InputIterator>
    362     void
    363     _Scanner<_InputIterator>::
    364     _M_eat_escape()
    365     {
    366       ++_M_current;
    367       if (_M_current == _M_end)
    368 	{
    369 	  _M_curToken = _S_token_eof;
    370 	  return;
    371 	}
    372       _CharT __c = *_M_current;
    373       ++_M_current;
    374 
    375       if (__c == _M_ctype.widen('('))
    376 	{
    377 	  if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
    378 	    {
    379 	      _M_curToken = _S_token_ord_char;
    380 	      _M_curValue.assign(1, __c);
    381 	    }
    382 	  else
    383 	    _M_curToken = _S_token_subexpr_begin;
    384 	}
    385       else if (__c == _M_ctype.widen(')'))
    386 	{
    387 	  if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
    388 	    {
    389 	      _M_curToken = _S_token_ord_char;
    390 	      _M_curValue.assign(1, __c);
    391 	    }
    392 	  else
    393 	    _M_curToken = _S_token_subexpr_end;
    394 	}
    395       else if (__c == _M_ctype.widen('{'))
    396 	{
    397 	  if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
    398 	    {
    399 	      _M_curToken = _S_token_ord_char;
    400 	      _M_curValue.assign(1, __c);
    401 	    }
    402 	  else
    403 	    {
    404 	      _M_curToken = _S_token_interval_begin;
    405 	      _M_state |= _S_state_in_brace;
    406 	    }
    407 	}
    408       else if (__c == _M_ctype.widen('}'))
    409 	{
    410 	  if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
    411 	    {
    412 	      _M_curToken = _S_token_ord_char;
    413 	      _M_curValue.assign(1, __c);
    414 	    }
    415 	  else
    416 	    {
    417 	      if (!(_M_state && _S_state_in_brace))
    418 		__throw_regex_error(regex_constants::error_badbrace);
    419 	      _M_state &= ~_S_state_in_brace;
    420 	      _M_curToken = _S_token_interval_end;
    421 	    }
    422 	}
    423       else if (__c == _M_ctype.widen('x'))
    424 	{
    425 	  ++_M_current;
    426 	  if (_M_current == _M_end)
    427 	    {
    428 	      _M_curToken = _S_token_eof;
    429 	      return;
    430 	    }
    431 	  if (_M_ctype.is(_CtypeT::digit, *_M_current))
    432 	    {
    433 	      _M_curValue.assign(1, *_M_current);
    434 	      ++_M_current;
    435 	      if (_M_current == _M_end)
    436 		{
    437 		  _M_curToken = _S_token_eof;
    438 		  return;
    439 		}
    440 	      if (_M_ctype.is(_CtypeT::digit, *_M_current))
    441 		{
    442 		  _M_curValue += *_M_current;
    443 		  ++_M_current;
    444 		  return;
    445 		}
    446 	    }
    447 	}
    448       else if (__c == _M_ctype.widen('^')
    449 	       || __c == _M_ctype.widen('.')
    450 	       || __c == _M_ctype.widen('*')
    451 	       || __c == _M_ctype.widen('$')
    452 	       || __c == _M_ctype.widen('\\'))
    453 	{
    454 	  _M_curToken = _S_token_ord_char;
    455 	  _M_curValue.assign(1, __c);
    456 	}
    457       else if (_M_ctype.is(_CtypeT::digit, __c))
    458 	{
    459 	  _M_curToken = _S_token_backref;
    460 	  _M_curValue.assign(1, __c);
    461 	}
    462       else
    463 	__throw_regex_error(regex_constants::error_escape);
    464     }
    465 
    466 
    467   // Eats a character class or throwns an exception.
    468   // current point to ':' delimiter on entry, char after ']' on return
    469   template<typename _InputIterator>
    470     void
    471     _Scanner<_InputIterator>::
    472     _M_eat_charclass()
    473     {
    474       ++_M_current; // skip ':'
    475       if (_M_current == _M_end)
    476 	__throw_regex_error(regex_constants::error_ctype);
    477       for (_M_curValue.clear();
    478 	   _M_current != _M_end && *_M_current != _M_ctype.widen(':');
    479 	   ++_M_current)
    480 	_M_curValue += *_M_current;
    481       if (_M_current == _M_end)
    482 	__throw_regex_error(regex_constants::error_ctype);
    483       ++_M_current; // skip ':'
    484       if (*_M_current != _M_ctype.widen(']'))
    485 	__throw_regex_error(regex_constants::error_ctype);
    486       ++_M_current; // skip ']'
    487     }
    488 
    489 
    490   template<typename _InputIterator>
    491     void
    492     _Scanner<_InputIterator>::
    493     _M_eat_equivclass()
    494     {
    495       ++_M_current; // skip '='
    496       if (_M_current == _M_end)
    497 	__throw_regex_error(regex_constants::error_collate);
    498       for (_M_curValue.clear();
    499 	   _M_current != _M_end && *_M_current != _M_ctype.widen('=');
    500 	   ++_M_current)
    501 	_M_curValue += *_M_current;
    502       if (_M_current == _M_end)
    503 	__throw_regex_error(regex_constants::error_collate);
    504       ++_M_current; // skip '='
    505       if (*_M_current != _M_ctype.widen(']'))
    506 	__throw_regex_error(regex_constants::error_collate);
    507       ++_M_current; // skip ']'
    508     }
    509 
    510 
    511   template<typename _InputIterator>
    512     void
    513     _Scanner<_InputIterator>::
    514     _M_eat_collsymbol()
    515     {
    516       ++_M_current; // skip '.'
    517       if (_M_current == _M_end)
    518 	__throw_regex_error(regex_constants::error_collate);
    519       for (_M_curValue.clear();
    520 	   _M_current != _M_end && *_M_current != _M_ctype.widen('.');
    521 	   ++_M_current)
    522 	_M_curValue += *_M_current;
    523       if (_M_current == _M_end)
    524 	__throw_regex_error(regex_constants::error_collate);
    525       ++_M_current; // skip '.'
    526       if (*_M_current != _M_ctype.widen(']'))
    527 	__throw_regex_error(regex_constants::error_collate);
    528       ++_M_current; // skip ']'
    529     }
    530 
    531 #ifdef _GLIBCXX_DEBUG
    532   template<typename _InputIterator>
    533     std::ostream&
    534     _Scanner<_InputIterator>::
    535     _M_print(std::ostream& ostr)
    536     {
    537       switch (_M_curToken)
    538       {
    539 	case _S_token_anychar:
    540 	  ostr << "any-character\n";
    541 	  break;
    542 	case _S_token_backref:
    543 	  ostr << "backref\n";
    544 	  break;
    545 	case _S_token_bracket_begin:
    546 	  ostr << "bracket-begin\n";
    547 	  break;
    548 	case _S_token_bracket_end:
    549 	  ostr << "bracket-end\n";
    550 	  break;
    551 	case _S_token_char_class_name:
    552 	  ostr << "char-class-name \"" << _M_curValue << "\"\n";
    553 	  break;
    554 	case _S_token_closure0:
    555 	  ostr << "closure0\n";
    556 	  break;
    557 	case _S_token_closure1:
    558 	  ostr << "closure1\n";
    559 	  break;
    560 	case _S_token_collelem_multi:
    561 	  ostr << "coll-elem-multi \"" << _M_curValue << "\"\n";
    562 	  break;
    563 	case _S_token_collelem_single:
    564 	  ostr << "coll-elem-single \"" << _M_curValue << "\"\n";
    565 	  break;
    566 	case _S_token_collsymbol:
    567 	  ostr << "collsymbol \"" << _M_curValue << "\"\n";
    568 	  break;
    569 	case _S_token_comma:
    570 	  ostr << "comma\n";
    571 	  break;
    572 	case _S_token_dash:
    573 	  ostr << "dash\n";
    574 	  break;
    575 	case _S_token_dup_count:
    576 	  ostr << "dup count: " << _M_curValue << "\n";
    577 	  break;
    578 	case _S_token_eof:
    579 	  ostr << "EOF\n";
    580 	  break;
    581 	case _S_token_equiv_class_name:
    582 	  ostr << "equiv-class-name \"" << _M_curValue << "\"\n";
    583 	  break;
    584 	case _S_token_interval_begin:
    585 	  ostr << "interval begin\n";
    586 	  break;
    587 	case _S_token_interval_end:
    588 	  ostr << "interval end\n";
    589 	  break;
    590 	case _S_token_line_begin:
    591 	  ostr << "line begin\n";
    592 	  break;
    593 	case _S_token_line_end:
    594 	  ostr << "line end\n";
    595 	  break;
    596 	case _S_token_opt:
    597 	  ostr << "opt\n";
    598 	  break;
    599 	case _S_token_or:
    600 	  ostr << "or\n";
    601 	  break;
    602 	case _S_token_ord_char:
    603 	  ostr << "ordinary character: \"" << _M_value() << "\"\n";
    604 	  break;
    605 	case _S_token_quoted_char:
    606 	  ostr << "quoted char\n";
    607 	  break;
    608 	case _S_token_subexpr_begin:
    609 	  ostr << "subexpr begin\n";
    610 	  break;
    611 	case _S_token_subexpr_end:
    612 	  ostr << "subexpr end\n";
    613 	  break;
    614 	case _S_token_word_begin:
    615 	  ostr << "word begin\n";
    616 	  break;
    617 	case _S_token_word_end:
    618 	  ostr << "word end\n";
    619 	  break;
    620 	case _S_token_unknown:
    621 	  ostr << "-- unknown token --\n";
    622 	  break;
    623       }
    624       return ostr;
    625     }
    626 #endif
    627 
    628   // Builds an NFA from an input iterator interval.
    629   template<typename _InIter, typename _TraitsT>
    630     class _Compiler
    631     {
    632     public:
    633       typedef _InIter                                            _IterT;
    634       typedef typename std::iterator_traits<_InIter>::value_type _CharT;
    635       typedef std::basic_string<_CharT>                          _StringT;
    636       typedef regex_constants::syntax_option_type                _FlagT;
    637 
    638     public:
    639       _Compiler(const _InIter& __b, const _InIter& __e,
    640 		_TraitsT& __traits, _FlagT __flags);
    641 
    642       const _Nfa&
    643       _M_nfa() const
    644       { return _M_state_store; }
    645 
    646     private:
    647       typedef _Scanner<_InIter>                              _ScannerT;
    648       typedef typename _ScannerT::_TokenT                    _TokenT;
    649       typedef std::stack<_StateSeq, std::vector<_StateSeq> > _StackT;
    650       typedef _RangeMatcher<_InIter, _TraitsT>               _RMatcherT;
    651 
    652       // accepts a specific token or returns false.
    653       bool
    654       _M_match_token(_TokenT __token);
    655 
    656       void
    657       _M_disjunction();
    658 
    659       bool
    660       _M_alternative();
    661 
    662       bool
    663       _M_term();
    664 
    665       bool
    666       _M_assertion();
    667 
    668       bool
    669       _M_quantifier();
    670 
    671       bool
    672       _M_atom();
    673 
    674       bool
    675       _M_bracket_expression();
    676 
    677       bool
    678       _M_bracket_list(_RMatcherT& __matcher);
    679 
    680       bool
    681       _M_follow_list(_RMatcherT& __matcher);
    682 
    683       bool
    684       _M_follow_list2(_RMatcherT& __matcher);
    685 
    686       bool
    687       _M_expression_term(_RMatcherT& __matcher);
    688 
    689       bool
    690       _M_range_expression(_RMatcherT& __matcher);
    691 
    692       bool
    693       _M_start_range(_RMatcherT& __matcher);
    694 
    695       bool
    696       _M_collating_symbol(_RMatcherT& __matcher);
    697 
    698       bool
    699       _M_equivalence_class(_RMatcherT& __matcher);
    700 
    701       bool
    702       _M_character_class(_RMatcherT& __matcher);
    703 
    704       int
    705       _M_cur_int_value(int __radix);
    706 
    707     private:
    708       _TraitsT&      _M_traits;
    709       _ScannerT      _M_scanner;
    710       _StringT       _M_cur_value;
    711       _Nfa           _M_state_store;
    712       _StackT        _M_stack;
    713     };
    714 
    715   template<typename _InIter, typename _TraitsT>
    716     _Compiler<_InIter, _TraitsT>::
    717     _Compiler(const _InIter& __b, const _InIter& __e, _TraitsT& __traits,
    718 	      _Compiler<_InIter, _TraitsT>::_FlagT __flags)
    719     : _M_traits(__traits), _M_scanner(__b, __e, __flags, _M_traits.getloc()),
    720       _M_state_store(__flags)
    721     {
    722       typedef _StartTagger<_InIter, _TraitsT> _Start;
    723       typedef _EndTagger<_InIter, _TraitsT> _End;
    724 
    725       _StateSeq __r(_M_state_store,
    726       		    _M_state_store._M_insert_subexpr_begin(_Start(0)));
    727       _M_disjunction();
    728       if (!_M_stack.empty())
    729 	{
    730 	  __r._M_append(_M_stack.top());
    731 	  _M_stack.pop();
    732 	}
    733       __r._M_append(_M_state_store._M_insert_subexpr_end(0, _End(0)));
    734       __r._M_append(_M_state_store._M_insert_accept());
    735     }
    736 
    737   template<typename _InIter, typename _TraitsT>
    738     bool
    739     _Compiler<_InIter, _TraitsT>::
    740     _M_match_token(_Compiler<_InIter, _TraitsT>::_TokenT token)
    741     {
    742       if (token == _M_scanner._M_token())
    743 	{
    744 	  _M_cur_value = _M_scanner._M_value();
    745 	  _M_scanner._M_advance();
    746 	  return true;
    747 	}
    748       return false;
    749     }
    750 
    751   template<typename _InIter, typename _TraitsT>
    752     void
    753     _Compiler<_InIter, _TraitsT>::
    754     _M_disjunction()
    755     {
    756       this->_M_alternative();
    757       if (_M_match_token(_ScannerT::_S_token_or))
    758 	{
    759 	  _StateSeq __alt1 = _M_stack.top(); _M_stack.pop();
    760 	  this->_M_disjunction();
    761 	  _StateSeq __alt2 = _M_stack.top(); _M_stack.pop();
    762 	  _M_stack.push(_StateSeq(__alt1, __alt2));
    763 	}
    764     }
    765 
    766   template<typename _InIter, typename _TraitsT>
    767     bool
    768     _Compiler<_InIter, _TraitsT>::
    769     _M_alternative()
    770     {
    771       if (this->_M_term())
    772 	{
    773 	  _StateSeq __re = _M_stack.top(); _M_stack.pop();
    774 	  this->_M_alternative();
    775 	  if (!_M_stack.empty())
    776 	    {
    777 	      __re._M_append(_M_stack.top());
    778 	      _M_stack.pop();
    779 	    }
    780 	  _M_stack.push(__re);
    781 	  return true;
    782 	}
    783       return false;
    784     }
    785 
    786   template<typename _InIter, typename _TraitsT>
    787     bool
    788     _Compiler<_InIter, _TraitsT>::
    789     _M_term()
    790     {
    791       if (this->_M_assertion())
    792 	return true;
    793       if (this->_M_atom())
    794 	{
    795 	  this->_M_quantifier();
    796 	  return true;
    797 	}
    798       return false;
    799     }
    800 
    801   template<typename _InIter, typename _TraitsT>
    802     bool
    803     _Compiler<_InIter, _TraitsT>::
    804     _M_assertion()
    805     {
    806       if (_M_match_token(_ScannerT::_S_token_line_begin))
    807 	{
    808 	  // __m.push(_Matcher::_S_opcode_line_begin);
    809 	  return true;
    810 	}
    811       if (_M_match_token(_ScannerT::_S_token_line_end))
    812 	{
    813 	  // __m.push(_Matcher::_S_opcode_line_end);
    814 	  return true;
    815 	}
    816       if (_M_match_token(_ScannerT::_S_token_word_begin))
    817 	{
    818 	  // __m.push(_Matcher::_S_opcode_word_begin);
    819 	  return true;
    820 	}
    821       if (_M_match_token(_ScannerT::_S_token_word_end))
    822 	{
    823 	  // __m.push(_Matcher::_S_opcode_word_end);
    824 	  return true;
    825 	}
    826       return false;
    827     }
    828 
    829   template<typename _InIter, typename _TraitsT>
    830     bool
    831     _Compiler<_InIter, _TraitsT>::
    832     _M_quantifier()
    833     {
    834       if (_M_match_token(_ScannerT::_S_token_closure0))
    835 	{
    836 	  if (_M_stack.empty())
    837 	    __throw_regex_error(regex_constants::error_badrepeat);
    838 	  _StateSeq __r(_M_stack.top(), -1);
    839 	  __r._M_append(__r._M_front());
    840 	  _M_stack.pop();
    841 	  _M_stack.push(__r);
    842 	  return true;
    843 	}
    844       if (_M_match_token(_ScannerT::_S_token_closure1))
    845 	{
    846 	  if (_M_stack.empty())
    847 	    __throw_regex_error(regex_constants::error_badrepeat);
    848 	  _StateSeq __r(_M_state_store,
    849 			_M_state_store.
    850 			_M_insert_alt(_S_invalid_state_id,
    851 				      _M_stack.top()._M_front()));
    852 	  _M_stack.top()._M_append(__r);
    853 	  return true;
    854 	}
    855       if (_M_match_token(_ScannerT::_S_token_opt))
    856 	{
    857 	  if (_M_stack.empty())
    858 	  __throw_regex_error(regex_constants::error_badrepeat);
    859 	  _StateSeq __r(_M_stack.top(), -1);
    860 	  _M_stack.pop();
    861 	  _M_stack.push(__r);
    862 	  return true;
    863 	}
    864       if (_M_match_token(_ScannerT::_S_token_interval_begin))
    865 	{
    866 	  if (_M_stack.empty())
    867 	    __throw_regex_error(regex_constants::error_badrepeat);
    868 	  if (!_M_match_token(_ScannerT::_S_token_dup_count))
    869 	    __throw_regex_error(regex_constants::error_badbrace);
    870 	  _StateSeq __r(_M_stack.top());
    871 	  int __min_rep = _M_cur_int_value(10);
    872 	  for (int __i = 1; __i < __min_rep; ++__i)
    873 	    _M_stack.top()._M_append(__r._M_clone());
    874 	  if (_M_match_token(_ScannerT::_S_token_comma))
    875 	    if (_M_match_token(_ScannerT::_S_token_dup_count))
    876 	      {
    877 		int __n = _M_cur_int_value(10) - __min_rep;
    878 		if (__n < 0)
    879 		  __throw_regex_error(regex_constants::error_badbrace);
    880 		for (int __i = 0; __i < __n; ++__i)
    881 		  {
    882 		    _StateSeq __r(_M_state_store,
    883 				  _M_state_store.
    884 				  _M_insert_alt(_S_invalid_state_id,
    885 						_M_stack.top()._M_front()));
    886 		    _M_stack.top()._M_append(__r);
    887 		  }
    888 	      }
    889 	    else
    890 	      {
    891 		_StateSeq __r(_M_stack.top(), -1);
    892 		__r._M_push_back(__r._M_front());
    893 		_M_stack.pop();
    894 		_M_stack.push(__r);
    895 	      }
    896 	  if (!_M_match_token(_ScannerT::_S_token_interval_end))
    897 	    __throw_regex_error(regex_constants::error_brace);
    898 	  return true;
    899 	}
    900       return false;
    901     }
    902 
    903   template<typename _InIter, typename _TraitsT>
    904     bool
    905     _Compiler<_InIter, _TraitsT>::
    906     _M_atom()
    907     {
    908       typedef _CharMatcher<_InIter, _TraitsT> _CMatcher;
    909       typedef _StartTagger<_InIter, _TraitsT> _Start;
    910       typedef _EndTagger<_InIter, _TraitsT> _End;
    911 
    912       if (_M_match_token(_ScannerT::_S_token_anychar))
    913 	{
    914 	  _M_stack.push(_StateSeq(_M_state_store,
    915                                   _M_state_store._M_insert_matcher
    916                                   (_AnyMatcher)));
    917 	  return true;
    918 	}
    919       if (_M_match_token(_ScannerT::_S_token_ord_char))
    920 	{
    921 	  _M_stack.push(_StateSeq(_M_state_store,
    922                                   _M_state_store._M_insert_matcher
    923                                   (_CMatcher(_M_cur_value[0], _M_traits))));
    924 	  return true;
    925 	}
    926       if (_M_match_token(_ScannerT::_S_token_quoted_char))
    927 	{
    928 	  // note that in the ECMA grammar, this case covers backrefs.
    929 	  _M_stack.push(_StateSeq(_M_state_store,
    930 				  _M_state_store._M_insert_matcher
    931 				  (_CMatcher(_M_cur_value[0], _M_traits))));
    932 	  return true;
    933 	}
    934       if (_M_match_token(_ScannerT::_S_token_backref))
    935 	{
    936 	  // __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value);
    937 	  return true;
    938 	}
    939       if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
    940 	{
    941 	  int __mark = _M_state_store._M_sub_count();
    942 	  _StateSeq __r(_M_state_store,
    943 			_M_state_store.
    944 			_M_insert_subexpr_begin(_Start(__mark)));
    945 	  this->_M_disjunction();
    946 	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
    947 	    __throw_regex_error(regex_constants::error_paren);
    948 	  if (!_M_stack.empty())
    949 	    {
    950 	      __r._M_append(_M_stack.top());
    951 	      _M_stack.pop();
    952 	    }
    953 	  __r._M_append(_M_state_store._M_insert_subexpr_end
    954 			(__mark, _End(__mark)));
    955 	  _M_stack.push(__r);
    956 	  return true;
    957 	}
    958       return _M_bracket_expression();
    959     }
    960 
    961   template<typename _InIter, typename _TraitsT>
    962     bool
    963     _Compiler<_InIter, _TraitsT>::
    964     _M_bracket_expression()
    965     {
    966       if (_M_match_token(_ScannerT::_S_token_bracket_begin))
    967 	{
    968 	  _RMatcherT __matcher(_M_match_token(_ScannerT::_S_token_line_begin),
    969 			       _M_traits);
    970 	  if (!_M_bracket_list(__matcher)
    971 	      || !_M_match_token(_ScannerT::_S_token_bracket_end))
    972 	    __throw_regex_error(regex_constants::error_brack);
    973 	  _M_stack.push(_StateSeq(_M_state_store,
    974 				  _M_state_store._M_insert_matcher(__matcher)));
    975 	  return true;
    976 	}
    977       return false;
    978     }
    979 
    980   // If the dash is the last character in the bracket expression, it is not
    981   // special.
    982   template<typename _InIter, typename _TraitsT>
    983     bool
    984     _Compiler<_InIter, _TraitsT>::
    985     _M_bracket_list(_RMatcherT& __matcher)
    986     {
    987       if (_M_follow_list(__matcher))
    988 	{
    989 	  if (_M_match_token(_ScannerT::_S_token_dash))
    990 	    __matcher._M_add_char(_M_cur_value[0]);
    991 	  return true;
    992 	}
    993       return false;
    994     }
    995 
    996   template<typename _InIter, typename _TraitsT>
    997     bool
    998     _Compiler<_InIter, _TraitsT>::
    999     _M_follow_list(_RMatcherT& __matcher)
   1000     { return _M_expression_term(__matcher) && _M_follow_list2(__matcher); }
   1001 
   1002   template<typename _InIter, typename _TraitsT>
   1003     bool
   1004     _Compiler<_InIter, _TraitsT>::
   1005     _M_follow_list2(_RMatcherT& __matcher)
   1006     {
   1007       if (_M_expression_term(__matcher))
   1008 	return _M_follow_list2(__matcher);
   1009       return true;
   1010     }
   1011 
   1012   template<typename _InIter, typename _TraitsT>
   1013     bool
   1014     _Compiler<_InIter, _TraitsT>::
   1015     _M_expression_term(_RMatcherT& __matcher)
   1016     {
   1017       return (_M_collating_symbol(__matcher)
   1018 	      || _M_character_class(__matcher)
   1019 	      || _M_equivalence_class(__matcher)
   1020 	      || (_M_start_range(__matcher)
   1021 		  && _M_range_expression(__matcher)));
   1022     }
   1023 
   1024   template<typename _InIter, typename _TraitsT>
   1025     bool
   1026     _Compiler<_InIter, _TraitsT>::
   1027     _M_range_expression(_RMatcherT& __matcher)
   1028     {
   1029       if (!_M_collating_symbol(__matcher))
   1030 	if (!_M_match_token(_ScannerT::_S_token_dash))
   1031 	  __throw_regex_error(regex_constants::error_range);
   1032       __matcher._M_make_range();
   1033       return true;
   1034     }
   1035 
   1036   template<typename _InIter, typename _TraitsT>
   1037     bool
   1038     _Compiler<_InIter, _TraitsT>::
   1039     _M_start_range(_RMatcherT& __matcher)
   1040     { return _M_match_token(_ScannerT::_S_token_dash); }
   1041 
   1042   template<typename _InIter, typename _TraitsT>
   1043     bool
   1044     _Compiler<_InIter, _TraitsT>::
   1045     _M_collating_symbol(_RMatcherT& __matcher)
   1046     {
   1047       if (_M_match_token(_ScannerT::_S_token_collelem_single))
   1048 	{
   1049 	  __matcher._M_add_char(_M_cur_value[0]);
   1050 	  return true;
   1051 	}
   1052       if (_M_match_token(_ScannerT::_S_token_collsymbol))
   1053 	{
   1054 	  __matcher._M_add_collating_element(_M_cur_value);
   1055 	  return true;
   1056 	}
   1057       return false;
   1058     }
   1059 
   1060   template<typename _InIter, typename _TraitsT>
   1061     bool
   1062     _Compiler<_InIter, _TraitsT>::
   1063     _M_equivalence_class(_RMatcherT& __matcher)
   1064     {
   1065       if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
   1066 	{
   1067 	  __matcher._M_add_equivalence_class(_M_cur_value);
   1068 	  return true;
   1069 	}
   1070       return false;
   1071     }
   1072 
   1073   template<typename _InIter, typename _TraitsT>
   1074     bool
   1075     _Compiler<_InIter, _TraitsT>::
   1076     _M_character_class(_RMatcherT& __matcher)
   1077     {
   1078       if (_M_match_token(_ScannerT::_S_token_char_class_name))
   1079 	{
   1080 	  __matcher._M_add_character_class(_M_cur_value);
   1081 	  return true;
   1082 	}
   1083       return false;
   1084     }
   1085 
   1086   template<typename _InIter, typename _TraitsT>
   1087     int
   1088     _Compiler<_InIter, _TraitsT>::
   1089     _M_cur_int_value(int __radix)
   1090     {
   1091       int __v = 0;
   1092       for (typename _StringT::size_type __i = 0;
   1093 	   __i < _M_cur_value.length(); ++__i)
   1094 	__v =__v * __radix + _M_traits.value(_M_cur_value[__i], __radix);
   1095       return __v;
   1096     }
   1097 
   1098   template<typename _InIter, typename _TraitsT>
   1099     _AutomatonPtr
   1100     __compile(const _InIter& __b, const _InIter& __e, _TraitsT& __t,
   1101 	      regex_constants::syntax_option_type __f)
   1102     { return _AutomatonPtr(new _Nfa(_Compiler<_InIter, _TraitsT>(__b, __e, __t,
   1103                                         __f)._M_nfa())); }
   1104 
   1105 _GLIBCXX_END_NAMESPACE_VERSION
   1106 } // namespace __regex
   1107 } // namespace std
   1108 
   1109 /* vim: set ts=8 sw=2 sts=2: */
   1110