Home | History | Annotate | Download | only in bits
      1 // class template regex -*- C++ -*-
      2 
      3 // Copyright (C) 2010, 2011 Free Software Foundation, Inc.
      4 //
      5 // This file is part of the GNU ISO C++ Library.  This library is free
      6 // software; you can redistribute it and/or modify it under the
      7 // terms of the GNU General Public License as published by the
      8 // Free Software Foundation; either version 3, or (at your option)
      9 // any later version.
     10 
     11 // This library is distributed in the hope that it will be useful,
     12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 // GNU General Public License for more details.
     15 
     16 // Under Section 7 of GPL version 3, you are granted additional
     17 // permissions described in the GCC Runtime Library Exception, version
     18 // 3.1, as published by the Free Software Foundation.
     19 
     20 // You should have received a copy of the GNU General Public License and
     21 // a copy of the GCC Runtime Library Exception along with this program;
     22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     23 // <http://www.gnu.org/licenses/>.
     24 
     25 /**
     26  *  @file bits/regex_nfa.h
     27  *  This is an internal header file, included by other library headers.
     28  *  Do not attempt to use it directly. @headername{regex}
     29  */
     30 
     31 namespace std _GLIBCXX_VISIBILITY(default)
     32 {
     33 namespace __regex
     34 {
     35 _GLIBCXX_BEGIN_NAMESPACE_VERSION
     36 
     37   // Base class for, um, automata.  Could be an NFA or a DFA.  Your choice.
     38   class _Automaton
     39   {
     40   public:
     41     typedef unsigned int _SizeT;
     42 
     43   public:
     44     virtual
     45     ~_Automaton() { }
     46 
     47     virtual _SizeT
     48     _M_sub_count() const = 0;
     49 
     50 #ifdef _GLIBCXX_DEBUG
     51     virtual std::ostream&
     52     _M_dot(std::ostream& __ostr) const = 0;
     53 #endif
     54   };
     55 
     56   // Generic shared pointer to an automaton.
     57   typedef std::shared_ptr<_Automaton> _AutomatonPtr;
     58 
     59   // Operation codes that define the type of transitions within the base NFA
     60   // that represents the regular expression.
     61   enum _Opcode
     62   {
     63       _S_opcode_unknown       =   0,
     64       _S_opcode_alternative   =   1,
     65       _S_opcode_subexpr_begin =   4,
     66       _S_opcode_subexpr_end   =   5,
     67       _S_opcode_match         = 100,
     68       _S_opcode_accept        = 255
     69   };
     70 
     71   // Provides a generic facade for a templated match_results.
     72   struct _Results
     73   {
     74     virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0;
     75     virtual void _M_set_matched(int __i, bool __is_matched) = 0;
     76   };
     77 
     78   // Tags current state (for subexpr begin/end).
     79   typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger;
     80 
     81   template<typename _FwdIterT, typename _TraitsT>
     82     struct _StartTagger
     83     {
     84       explicit
     85       _StartTagger(int __i)
     86       : _M_index(__i)
     87       { }
     88 
     89       void
     90       operator()(const _PatternCursor& __pc, _Results& __r)
     91       { __r._M_set_pos(_M_index, 0, __pc); }
     92 
     93       int       _M_index;
     94     };
     95 
     96   template<typename _FwdIterT, typename _TraitsT>
     97     struct _EndTagger
     98     {
     99       explicit
    100       _EndTagger(int __i)
    101       : _M_index(__i)
    102       { }
    103 
    104       void
    105       operator()(const _PatternCursor& __pc, _Results& __r)
    106       { __r._M_set_pos(_M_index, 1, __pc); }
    107 
    108       int       _M_index;
    109       _FwdIterT _M_pos;
    110     };
    111   // Indicates if current state matches cursor current.
    112   typedef std::function<bool (const _PatternCursor&)> _Matcher;
    113 
    114   // Matches any character
    115   inline bool
    116   _AnyMatcher(const _PatternCursor&)
    117   { return true; }
    118 
    119   // Matches a single character
    120   template<typename _InIterT, typename _TraitsT>
    121     struct _CharMatcher
    122     {
    123       typedef typename _TraitsT::char_type char_type;
    124 
    125       explicit
    126       _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT())
    127       : _M_traits(__t), _M_c(_M_traits.translate(__c))
    128       { }
    129 
    130       bool
    131       operator()(const _PatternCursor& __pc) const
    132       {
    133 	typedef const _SpecializedCursor<_InIterT>& _CursorT;
    134 	_CursorT __c = static_cast<_CursorT>(__pc);
    135 	return _M_traits.translate(__c._M_current()) == _M_c;
    136       }
    137 
    138       const _TraitsT& _M_traits;
    139       char_type       _M_c;
    140     };
    141 
    142   // Matches a character range (bracket expression)
    143   template<typename _InIterT, typename _TraitsT>
    144     struct _RangeMatcher
    145     {
    146       typedef typename _TraitsT::char_type _CharT;
    147       typedef std::basic_string<_CharT>    _StringT;
    148 
    149       explicit
    150       _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT())
    151       : _M_traits(__t), _M_is_non_matching(__is_non_matching)
    152       { }
    153 
    154       bool
    155       operator()(const _PatternCursor& __pc) const
    156       {
    157 	typedef const _SpecializedCursor<_InIterT>& _CursorT;
    158 	_CursorT __c = static_cast<_CursorT>(__pc);
    159 	return true;
    160       }
    161 
    162       void
    163       _M_add_char(_CharT __c)
    164       { }
    165 
    166       void
    167       _M_add_collating_element(const _StringT& __s)
    168       { }
    169 
    170       void
    171       _M_add_equivalence_class(const _StringT& __s)
    172       { }
    173 
    174       void
    175       _M_add_character_class(const _StringT& __s)
    176       { }
    177 
    178       void
    179       _M_make_range()
    180       { }
    181 
    182       const _TraitsT& _M_traits;
    183       bool            _M_is_non_matching;
    184     };
    185 
    186   // Identifies a state in the NFA.
    187   typedef int _StateIdT;
    188 
    189   // The special case in which a state identifier is not an index.
    190   static const _StateIdT _S_invalid_state_id  = -1;
    191 
    192 
    193   // An individual state in an NFA
    194   //
    195   // In this case a "state" is an entry in the NFA definition coupled with its
    196   // outgoing transition(s).  All states have a single outgoing transition,
    197   // except for accepting states (which have no outgoing transitions) and alt
    198   // states, which have two outgoing transitions.
    199   //
    200   struct _State
    201   {
    202     typedef int  _OpcodeT;
    203 
    204     _OpcodeT     _M_opcode;    // type of outgoing transition
    205     _StateIdT    _M_next;      // outgoing transition
    206     _StateIdT    _M_alt;       // for _S_opcode_alternative
    207     unsigned int _M_subexpr;   // for _S_opcode_subexpr_*
    208     _Tagger      _M_tagger;    // for _S_opcode_subexpr_*
    209     _Matcher     _M_matches;   // for _S_opcode_match
    210 
    211     explicit _State(_OpcodeT __opcode)
    212     : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
    213     { }
    214 
    215     _State(const _Matcher& __m)
    216     : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m)
    217     { }
    218 
    219     _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t)
    220     : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s),
    221       _M_tagger(__t)
    222     { }
    223 
    224     _State(_StateIdT __next, _StateIdT __alt)
    225     : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
    226     { }
    227 
    228 #ifdef _GLIBCXX_DEBUG
    229     std::ostream&
    230     _M_print(std::ostream& ostr) const;
    231 
    232     // Prints graphviz dot commands for state.
    233     std::ostream&
    234     _M_dot(std::ostream& __ostr, _StateIdT __id) const;
    235 #endif
    236   };
    237 
    238 
    239   // The Grep Matcher works on sets of states.  Here are sets of states.
    240   typedef std::set<_StateIdT> _StateSet;
    241 
    242  // A collection of all states making up an NFA
    243   //
    244   // An NFA is a 4-tuple M = (K, S, s, F), where
    245   //    K is a finite set of states,
    246   //    S is the alphabet of the NFA,
    247   //    s is the initial state,
    248   //    F is a set of final (accepting) states.
    249   //
    250   // This NFA class is templated on S, a type that will hold values of the
    251   // underlying alphabet (without regard to semantics of that alphabet).  The
    252   // other elements of the tuple are generated during construction of the NFA
    253   // and are available through accessor member functions.
    254   //
    255   class _Nfa
    256   : public _Automaton, public std::vector<_State>
    257   {
    258   public:
    259     typedef _State                              _StateT;
    260     typedef unsigned int                        _SizeT;
    261     typedef regex_constants::syntax_option_type _FlagT;
    262 
    263   public:
    264     _Nfa(_FlagT __f)
    265     : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0)
    266     { }
    267 
    268     ~_Nfa()
    269     { }
    270 
    271     _FlagT
    272     _M_options() const
    273     { return _M_flags; }
    274 
    275     _StateIdT
    276     _M_start() const
    277     { return _M_start_state; }
    278 
    279     const _StateSet&
    280     _M_final_states() const
    281     { return _M_accepting_states; }
    282 
    283     _SizeT
    284     _M_sub_count() const
    285     { return _M_subexpr_count; }
    286 
    287     _StateIdT
    288     _M_insert_accept()
    289     {
    290       this->push_back(_StateT(_S_opcode_accept));
    291       _M_accepting_states.insert(this->size()-1);
    292       return this->size()-1;
    293     }
    294 
    295     _StateIdT
    296     _M_insert_alt(_StateIdT __next, _StateIdT __alt)
    297     {
    298       this->push_back(_StateT(__next, __alt));
    299       return this->size()-1;
    300     }
    301 
    302     _StateIdT
    303     _M_insert_matcher(_Matcher __m)
    304     {
    305       this->push_back(_StateT(__m));
    306       return this->size()-1;
    307     }
    308 
    309     _StateIdT
    310     _M_insert_subexpr_begin(const _Tagger& __t)
    311     {
    312       this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++, __t));
    313       return this->size()-1;
    314     }
    315 
    316     _StateIdT
    317     _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t)
    318     {
    319       this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t));
    320       return this->size()-1;
    321     }
    322 
    323 #ifdef _GLIBCXX_DEBUG
    324     std::ostream&
    325     _M_dot(std::ostream& __ostr) const;
    326 #endif
    327 
    328   private:
    329     _FlagT     _M_flags;
    330     _StateIdT  _M_start_state;
    331     _StateSet  _M_accepting_states;
    332     _SizeT     _M_subexpr_count;
    333   };
    334 
    335   // Describes a sequence of one or more %_State, its current start and end(s).
    336   //
    337   // This structure contains fragments of an NFA during construction.
    338   class _StateSeq
    339   {
    340   public:
    341     // Constructs a single-node sequence
    342     _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id)
    343     : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e)
    344     { }
    345     // Constructs a split sequence from two other sequencces
    346     _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2)
    347     : _M_nfa(__e1._M_nfa),
    348       _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)),
    349       _M_end1(__e1._M_end1), _M_end2(__e2._M_end1)
    350     { }
    351 
    352     // Constructs a split sequence from a single sequence
    353     _StateSeq(const _StateSeq& __e, _StateIdT __id)
    354     : _M_nfa(__e._M_nfa),
    355       _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)),
    356       _M_end1(__id), _M_end2(__e._M_end1)
    357     { }
    358 
    359     // Constructs a copy of a %_StateSeq
    360     _StateSeq(const _StateSeq& __rhs)
    361     : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start),
    362       _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2)
    363     { }
    364 
    365 
    366     _StateSeq& operator=(const _StateSeq& __rhs);
    367 
    368     _StateIdT
    369     _M_front() const
    370     { return _M_start; }
    371 
    372     // Extends a sequence by one.
    373     void
    374     _M_push_back(_StateIdT __id);
    375 
    376     // Extends and maybe joins a sequence.
    377     void
    378     _M_append(_StateIdT __id);
    379 
    380     void
    381     _M_append(_StateSeq& __rhs);
    382 
    383     // Clones an entire sequence.
    384     _StateIdT
    385     _M_clone();
    386 
    387   private:
    388     _Nfa&     _M_nfa;
    389     _StateIdT _M_start;
    390     _StateIdT _M_end1;
    391     _StateIdT _M_end2;
    392 
    393   };
    394 
    395 _GLIBCXX_END_NAMESPACE_VERSION
    396 } // namespace __regex
    397 } // namespace std
    398 
    399 #include <bits/regex_nfa.tcc>
    400 
    401