Home | History | Annotate | Download | only in include
      1 /** \file
      2  * Defines the interface for an ANTLR3 common token stream. Custom token streams should create
      3  * one of these and then override any functions by installing their own pointers
      4  * to implement the various functions.
      5  */
      6 #ifndef	_ANTLR3_TOKENSTREAM_HPP
      7 #define	_ANTLR3_TOKENSTREAM_HPP
      8 
      9 // [The "BSD licence"]
     10 // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
     11 
     12 //
     13 // All rights reserved.
     14 //
     15 // Redistribution and use in source and binary forms, with or without
     16 // modification, are permitted provided that the following conditions
     17 // are met:
     18 // 1. Redistributions of source code must retain the above copyright
     19 //    notice, this list of conditions and the following disclaimer.
     20 // 2. Redistributions in binary form must reproduce the above copyright
     21 //    notice, this list of conditions and the following disclaimer in the
     22 //    documentation and/or other materials provided with the distribution.
     23 // 3. The name of the author may not be used to endorse or promote products
     24 //    derived from this software without specific prior written permission.
     25 //
     26 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     27 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     28 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     29 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     30 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     31 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     32 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     33 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     34 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     35 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     36 
     37 #include    "antlr3defs.hpp"
     38 
     39 /** Definition of a token source, which has a pointer to a function that
     40  *  returns the next token (using a token factory if it is going to be
     41  *  efficient) and a pointer to an ANTLR3_INPUT_STREAM. This is slightly
     42  *  different to the Java interface because we have no way to implement
     43  *  multiple interfaces without defining them in the interface structure
     44  *  or casting (void *), which is too convoluted.
     45  */
     46 ANTLR_BEGIN_NAMESPACE()
     47 
     48 //We are not making it subclass AllocPolicy, as this will always be a base class
/** Base class for objects that produce tokens (the lexer side of the pipeline).
 *
 *  It stores two pre-built tokens (EOF and SKIP) plus the name of the file the
 *  tokens were read from, and declares the nextToken() family through which
 *  tokens are pulled. The constructor is protected, so an instance can only be
 *  created as part of a derived class. Member functions are only declared here;
 *  their definitions are not in this header.
 *
 *  \tparam ImplTraits  Traits bundle; must supply CommonTokenType, StringType
 *                      and LexerType.
 */
template<class ImplTraits>
class TokenSource
{
public:
	// Token type handed out by this source (two names for the same type).
	typedef typename ImplTraits::CommonTokenType TokenType;
	typedef TokenType CommonTokenType;
	// String type used for the file name.
	typedef typename ImplTraits::StringType StringType;
	// Lexer type returned by get_super().
	typedef typename ImplTraits::LexerType LexerType;

private:
    /** A special pre-allocated token, which signifies End Of Tokens. Because this must
     *  be set up with the current input index and so on, the token is embedded in this
     *  object rather than created on demand. It is marked as factoryMade, so that it is
     *  never attempted to be freed.
     */
    TokenType				m_eofToken;

	/// A special pre-allocated token, which is returned by mTokens() if the
	/// lexer rule said to just skip the generated token altogether.
	/// Having this single token stops us wasting memory by having the token factory
	/// actually create something that we are going to SKIP(); anyway.
	///
	TokenType				m_skipToken;

    /** When the token source is constructed, it is populated with the file
     *  name from whence the tokens were produced by the lexer. It is a copy of
     *  the name supplied by the CharStream and should not be manipulated other
     *  than to copy or print it.
     *  NOTE(review): the original C documentation described this as a pointer that
     *  may be NULL; the member is a StringType value here, so "absent" is presumably
     *  an empty string - confirm.
     */
    StringType				m_fileName;

public:
	/// Accessors for the embedded EOF token (mutable and read-only).
	TokenType& get_eofToken();
	const TokenType& get_eofToken() const;
	/// Accessor for the embedded SKIP token.
	TokenType& get_skipToken();
	/// Accessor for the stored file name.
	StringType& get_fileName();
	/// Returns the lexer associated with this token source.
	/// NOTE(review): presumably the derived lexer object itself ("the lexer is
	/// addressed by the super structure pointer" in the original docs) - confirm.
	LexerType* get_super();

	/// Replaces the stored file name.
	void set_fileName( const StringType& fileName );

	/**
	 * \brief
	 * Default implementation of the nextToken() call for a lexer.
	 *
	 * \returns
	 * The next token in the current input stream or the EOF token
	 * if there are no more tokens in any input stream in the stack.
	 *
	 * \remarks
	 * The two overloads taking a BoolForwarder tag are selected at compile time.
	 * The commented-out parameter name suggests the tag encodes whether the lexer
	 * runs in filter mode - TODO confirm.
	 *
	 * \see nextTokenStr
	 */
    TokenType*  nextToken();
	CommonTokenType* nextToken( BoolForwarder<true> /*isFiltered*/ );
	CommonTokenType* nextToken( BoolForwarder<false> /*isFiltered*/ );

	///
	/// \brief
	/// Returns the next available token from the current input stream.
	///
	/// \returns
	/// The next token in the current input stream or the EOF token
	/// if there are no more tokens.
	///
	/// \see nextToken
	///
	TokenType*	nextTokenStr();

protected:
	/// Protected so that only derived classes can construct a TokenSource.
	TokenSource();
};
    134 
    135 /** Definition of the ANTLR3 common token stream interface.
    136  * \remark
    137  * Much of the documentation for this interface is stolen from Ter's Java implementation.
    138  */
template<class ImplTraits>
class TokenStream  : public ImplTraits::TokenIntStreamType
{
	// Token-level stream layered on top of the integer stream supplied by
	// ImplTraits::TokenIntStreamType. It records where tokens come from
	// (m_tokenSource) and an optional debug listener. Only declarations appear
	// here; the definitions are not in this header.
public:
	typedef typename ImplTraits::TokenSourceType TokenSourceType;
	typedef typename ImplTraits::TokenIntStreamType IntStreamType;
	typedef typename ImplTraits::CommonTokenType TokenType;
	// The unit this stream hands out is a token.
	typedef TokenType UnitType;
	typedef typename ImplTraits::StringType StringType;
	typedef typename ImplTraits::DebugEventListenerType DebugEventListenerType;
	typedef typename ImplTraits::TokenStreamType TokenStreamType;
	typedef typename ImplTraits::ParserType ComponentType;

protected:
    /** Pointer to the token source for this stream
     */
    TokenSourceType*    m_tokenSource;

	/// Debugger interface, if this is a debugging token stream
	///
	DebugEventListenerType*	m_debugger;

	/// Indicates the initial stream state for dbgConsume()
	///
	bool				m_initialStreamState;

public:
	/// Constructs a stream from a token source and a debug listener.
	/// NOTE(review): presumably either pointer may be null (CommonTokenStream
	/// defaults both of its corresponding arguments to NULL) - confirm.
	TokenStream(TokenSourceType* source, DebugEventListenerType* debugger);
	/// Returns this object viewed as its integer-stream base type.
	IntStreamType* get_istream();
	/// Getter/setter pair for m_tokenSource.
	/// NOTE(review): getTokenSource()/setTokenSource() below have the same
	/// signatures apart from constness; they look like duplicates - confirm.
	TokenSourceType* get_tokenSource() const;
	void set_tokenSource( TokenSourceType* tokenSource );

    /** Get Token at current input pointer + i ahead where i=1 is next Token.
     *  i<0 indicates tokens in the past.  So -1 is previous token and -2 is
     *  two tokens ago. LT(0) is undefined.  For i>=n, return Token.EOFToken.
     *  Return null for LT(0) and any index that results in an absolute address
     *  that is negative.
     */
    const TokenType*  _LT(ANTLR_INT32 k);

    /** Where is this stream pulling tokens from?  This is not the name, but
     *  a pointer to the token source object. The token source contains a pointer
     *  to the input stream and a function that returns the next token.
     */
    TokenSourceType*   getTokenSource();

    /** Function that installs a token source for the stream
     */
    void	setTokenSource(TokenSourceType*   tokenSource);

    /** Return the text of all the tokens in the stream, as the old tramp in
     *  Leeds market used to say; "Get the lot!"
     */
    StringType	toString();

    /** Return the text of all tokens from start to stop, inclusive.
     *  If the stream does not buffer all the tokens then it can just
     *  return an empty string;  Grammars should not access $ruleLabel.text in
     *  an action in that case.
     */
    StringType	 toStringSS(ANTLR_MARKER start, ANTLR_MARKER stop);

    /** Because the user is not required to use a token with an index stored
     *  in it, we must provide a means for two token objects themselves to
     *  indicate the start/end location.  Most often this will just delegate
     *  to the index-based toStringSS(start, stop).  This is also parallel with
     *  the tree node stream's toString(Object,Object).
     */
    StringType	 toStringTT(const TokenType* start, const TokenType* stop);


    /** Function that sets the token stream into debugging mode
     */
    void	setDebugListener(DebugEventListenerType* debugger);

	/// Default constructor; what it initialises is not visible in this header.
	TokenStream();

};
    218 
/** Common token stream (the CommonTokenStream class declared further below, after the
 *  TokenStoreSelector helper) is an implementation of ANTLR_TOKEN_STREAM for the default
 *  parsers and recognizers. You may of course build your own implementation if
 *  you are so inclined.
 */
    223 template<bool TOKENS_ACCESSED_FROM_OWNING_RULE, class ListType, class MapType>
    224 class TokenStoreSelector
    225 {
    226 public:
    227 	typedef ListType TokensType;
    228 };
    229 
    230 template<class ListType, class MapType>
    231 class TokenStoreSelector<true, ListType, MapType>
    232 {
    233 public:
    234 	typedef MapType TokensType;
    235 };
    236 
template<class ImplTraits>
class	CommonTokenStream : public TokenStream<ImplTraits>
{
	// Buffering token stream: keeps the tokens pulled from the token source in
	// m_tokens, together with per-type channel overrides, a set of token types to
	// discard, and the cursor m_p. Only declarations appear here; the definitions
	// are not in this header.
public:
	typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
	typedef typename ImplTraits::BitsetType BitsetType;
	typedef typename ImplTraits::CommonTokenType TokenType;
	typedef typename ImplTraits::TokenSourceType TokenSourceType;
	typedef typename ImplTraits::DebugEventListenerType DebugEventListenerType;
	// Two candidate token containers: a list of tokens, or an ordered map from
	// ANTLR_MARKER to token.
	typedef typename AllocPolicyType::template ListType<TokenType> TokensListType;
	typedef typename AllocPolicyType::template OrderedMapType<ANTLR_MARKER, TokenType> TokensMapType;
	// The container actually used: the map when
	// ImplTraits::TOKENS_ACCESSED_FROM_OWNING_RULE is true, otherwise the list.
	typedef typename TokenStoreSelector< ImplTraits::TOKENS_ACCESSED_FROM_OWNING_RULE,
	                                       TokensListType, TokensMapType >::TokensType TokensType;

	typedef typename AllocPolicyType::template UnOrderedMapType<ANTLR_UINT32, ANTLR_UINT32> ChannelOverridesType;
	typedef typename AllocPolicyType::template OrderedSetType<ANTLR_UINT32> DiscardSetType;
	typedef typename AllocPolicyType::template ListType<ANTLR_UINT32> IntListType;
	typedef TokenStream<ImplTraits> BaseType;

private:
    /** Records every single token pulled from the source indexed by the token index.
     *  There might be more efficient ways to do this, such as referencing directly in to
     *  the token factory pools, but for now this is convenient and allows for iterations
     *  and so on.
     *  NOTE(review): the original text said the list "only stores pointers"; the
     *  container typedefs above hold TokenType values - confirm and update.
     */
    TokensType			m_tokens;

    /** Override map of tokens. If a token type has an entry in here, then
     *  the mapped value is the override channel number
     *  that should always be used for this token type.
     */
    ChannelOverridesType	m_channelOverrides;

    /** Discarded set. If a token type has an entry in this set, then tokens of
     *  that type are thrown away.
     */
    DiscardSetType			m_discardSet;

    /** The channel number that this token stream is tuned to. For instance, whitespace
     *  is usually tuned to channel 99, which no token stream would normally tune to and
     *  so it is thrown away.
     */
    ANTLR_UINT32			m_channel;

	/** The index into the tokens list of the current token (the next one that will be
     *  consumed). p = -1 indicates that the token list is empty.
     */
    ANTLR_INT32				m_p;

	/** The total number of tokens issued till now. For streams that delete tokens,
	 *  this helps in issuing the index.
	 */
	ANTLR_UINT32			m_nissued;

    /** If this flag is set to true, then tokens that the stream sees that are not
     *  in the channel that this stream is tuned to, are not tracked in the
     *  tokens table. When set to false, ALL tokens are added to the tracking.
     */
    bool					m_discardOffChannel;

public:
	/// Constructs the stream.
	/// \param hint      NOTE(review): presumably a size hint for the token store - confirm.
	/// \param source    Token source to pull tokens from (defaults to NULL).
	/// \param debugger  Debug listener (defaults to NULL).
	CommonTokenStream(ANTLR_UINT32 hint, TokenSourceType* source = NULL,
										DebugEventListenerType* debugger = NULL);
	~CommonTokenStream();
	/// Accessors for the token store and the discard set (mutable and read-only).
	TokensType& get_tokens();
	const TokensType& get_tokens() const;
	DiscardSetType& get_discardSet();
	const DiscardSetType& get_discardSet() const;
	/// Read, assign, increment and decrement the cursor m_p.
	ANTLR_INT32 get_p() const;
	void set_p( ANTLR_INT32 p );
	void inc_p();
	void dec_p();

    /** A simple filter mechanism whereby you can tell this token stream
     *  to force all tokens of type ttype to be on channel.  For example,
     *  when interpreting, we cannot exec actions so we need to tell
     *  the stream to force all WS and NEWLINE to be a different, ignored
     *  channel.
     */
    void setTokenTypeChannel(ANTLR_UINT32 ttype, ANTLR_UINT32 channel);

    /** Add a particular token type to the discard set. If a token is found to belong
     *  to this set, then it is skipped/thrown away.
     *  NOTE(review): ttype is signed here, while DiscardSetType stores ANTLR_UINT32 and
     *  setTokenTypeChannel() takes ANTLR_UINT32 - confirm the mismatch is intended.
     */
    void discardTokenType(ANTLR_INT32 ttype);

	// This will discard tokens of a particular rule after the rule execution completion.
	// The BoolForwarder overloads are compile-time variants; the tag is named after
	// TOKENS_ACCESSED_FROM_OWNING_RULE, so presumably one exists per storage type - confirm.
	void discardTokens( ANTLR_MARKER start, ANTLR_MARKER stop );
	void discardTokens( ANTLR_MARKER start, ANTLR_MARKER stop,
								BoolForwarder<true>  tokens_accessed_from_owning_rule  );
	void discardTokens( ANTLR_MARKER start, ANTLR_MARKER stop,
								BoolForwarder<false>  tokens_accessed_from_owning_rule  );

	// Adds a token to the stream's store; tag-dispatched in the same way as discardTokens().
	void insertToken( const TokenType& tok );
	void insertToken( const TokenType& tok, BoolForwarder<true>  tokens_accessed_from_owning_rule  );
	void insertToken( const TokenType& tok, BoolForwarder<false>  tokens_accessed_from_owning_rule  );

	/** Get a token at an absolute index i; 0..n-1.  This is really only
     *  needed for profiling and debugging and token stream rewriting.
     *  If you don't want to buffer up tokens, then this method makes no
     *  sense for you.  Naturally you can't use the rewrite stream feature.
     *  I believe DebugTokenStream can easily be altered to not use
     *  this method, removing the dependency.
     */
    const TokenType*   get(ANTLR_MARKER i);
	const TokenType*   getToken(ANTLR_MARKER i);
	const TokenType* getToken( ANTLR_MARKER tok_idx, BoolForwarder<true>  tokens_accessed_from_owning_rule );
	const TokenType* getToken( ANTLR_MARKER tok_idx, BoolForwarder<false>  tokens_accessed_from_owning_rule  );

    /** Signal to discard off channel tokens from here on in.
     */
    void discardOffChannelToks(bool discard);

    /** Function that returns a pointer to the container of all tokens
     *  in the stream (this causes the buffer to fill if we have not got any yet)
     */
    TokensType*	getTokens();

    /** Function that returns all the tokens between a start and a stop index.
     *  The result is delivered through the tokenRange output argument.
     */
    void getTokenRange(ANTLR_UINT32 start, ANTLR_UINT32 stop, TokensListType& tokenRange);

    /** Function that returns all the tokens indicated by the specified bitset, within a range of tokens.
     *  The result is delivered through the tokenSet output argument.
     */
    void getTokensSet(ANTLR_UINT32 start, ANTLR_UINT32 stop, BitsetType* types, TokensListType& tokenSet);

    /** Function that returns all the tokens indicated by being a member of the supplied List.
     *  The result is delivered through the tokenList output argument.
     */
    void getTokensList(ANTLR_UINT32 start, ANTLR_UINT32 stop,
									const IntListType& list, TokensListType& tokenList);

    /** Function that returns all tokens of a certain type within a range.
     *  The result is delivered through the tokens output argument.
     */
    void getTokensType(ANTLR_UINT32 start, ANTLR_UINT32 stop, ANTLR_UINT32 type, TokensListType& tokens);

    /** Function that resets the token stream so that it can be reused,
     *  but that does not free up any resources, such as the token factory,
     *  the factory pool and so on. This prevents the need to keep freeing
     *  and reallocating the token pools if the thing you are building is
     *  a multi-shot daemon or something like that. It is much faster to
     *  just reuse all the vectors.
     */
    void  reset();

	// NOTE(review): presumably "look back" k tokens, the backwards counterpart of
	// _LT() - confirm.
	const TokenType* LB(ANTLR_INT32 k);


	// Buffer-filling entry points; how the two differ is not visible in this header.
	void fillBufferExt();
	void fillBuffer();

	// Tag-dispatched test used while filling the buffer; cnt is presumably the
	// number of tokens fetched so far - confirm.
	bool hasReachedFillbufferTarget( ANTLR_UINT32 cnt, BoolForwarder<true>  tokens_accessed_from_owning_rule  );
	bool hasReachedFillbufferTarget( ANTLR_UINT32 cnt, BoolForwarder<false>  tokens_accessed_from_owning_rule  );

	// NOTE(review): presumably these step forwards / backwards from the given index
	// past tokens that are not on m_channel and return the resulting index - confirm.
	ANTLR_UINT32 skipOffTokenChannels(ANTLR_INT32 i);
	ANTLR_UINT32 skipOffTokenChannelsReverse(ANTLR_INT32 x);
	// Implementation hook for the stream's index query; presumably called by the
	// integer-stream base class - confirm.
	ANTLR_MARKER index_impl();
};
    395 
    396 class TokenAccessException : public std::exception
    397 {
    398 	virtual const char* what() const throw()
    399 	{
    400 		return " Attempted access on Deleted Token";
    401 	}
    402 };
    403 
    404 ANTLR_END_NAMESPACE()
    405 
    406 #include "antlr3tokenstream.inl"
    407 
    408 #endif
    409