Home | History | Annotate | Download | only in include
      1 /** \file
      2  * Base interface for any ANTLR3 lexer.
      3  *
      4  * An ANTLR3 lexer is built from two sets of components:
      5  *
      6  *  - The runtime components that provide common functionality such as
      7  *    traversing character streams, building tokens for output and so on.
      8  *  - The generated rules and structure of the actual lexer, which call upon the
      9  *    runtime components.
     10  *
     11  * A lexer class contains a character input stream, a base recognizer interface
     12  * (which it will normally implement) and a token source interface (which it also
     13  * implements). The token source interface is called by a token consumer (such as
     14  * a parser, but in theory it can be anything that wants a set of abstract
     15  * tokens in place of a raw character stream).
     16  *
     17  * So then, we set up a lexer in a sequence akin to:
     18  *
     19  *  - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
     20  *    and initialize it.
     21  *  - Create a lexer interface and tell it where its input stream is.
     22  *    This will cause the creation of a base recognizer class, which it will
     23  *    override with its own implementations of some methods. The lexer creator
     24  *    can also then in turn override anything it likes.
     25  *  - The lexer token source interface is then passed to some interface that
     26  *    knows how to use it, by calling for a next token.
     27  *  - When a next token is called, let ze lexing begin.
     28  *
     29  */
     30 #ifndef	_ANTLR3_LEXER
     31 #define	_ANTLR3_LEXER
     32 
     33 // [The "BSD licence"]
     34 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
     35 // http://www.temporal-wave.com
     36 // http://www.linkedin.com/in/jimidle
     37 //
     38 // All rights reserved.
     39 //
     40 // Redistribution and use in source and binary forms, with or without
     41 // modification, are permitted provided that the following conditions
     42 // are met:
     43 // 1. Redistributions of source code must retain the above copyright
     44 //    notice, this list of conditions and the following disclaimer.
     45 // 2. Redistributions in binary form must reproduce the above copyright
     46 //    notice, this list of conditions and the following disclaimer in the
     47 //    documentation and/or other materials provided with the distribution.
     48 // 3. The name of the author may not be used to endorse or promote products
     49 //    derived from this software without specific prior written permission.
     50 //
     51 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     52 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     53 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     54 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     55 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     56 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     57 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     58 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     59 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     60 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     61 
     62 /* Definitions: ANTLR3_STRING_TERMINATOR is 0xFFFFFFFF, the value that the matchs() documentation
     63  * below names as the terminator of ANTLR3_UCHAR strings (it is not a valid UTF32 character). */
     64 #define	ANTLR3_STRING_TERMINATOR	0xFFFFFFFF
     65 
     66 #include    <antlr3defs.h>
     67 #include    <antlr3input.h>
     68 #include    <antlr3commontoken.h>
     69 #include    <antlr3tokenstream.h>
     70 #include    <antlr3baserecognizer.h>
     71 
     72 #ifdef __cplusplus
     73 extern "C" {
     74 #endif
     75 /** The lexer interface: its input stream, its base recognizer and the function pointers that implement the lexer operations. */
     76 typedef	struct ANTLR3_LEXER_struct
     77 {
     78     /** If there is a super structure that is implementing the
     79      *  lexer, then a pointer to it can be stored here in case
     80      *  implementing functions are overridden by this super structure.
     81      */
     82     void	* super;
     83 
     84     /** A generated lexer has an mTokens() function, which needs
     85      *  the context pointer of the generated lexer, not the base lexer interface.
     86      *  That pointer is stored here and initialized by the generated code (or manually,
     87      *  if this is a manually built lexer).
     88      */
     89     void	* ctx;
     90 
     91     /** A pointer to the character stream whence this lexer is receiving
     92      *  characters.
     93      *  TODO: I may come back to this and implement charstream outside
     94      *  the input stream as per the Java implementation.
     95      */
     96     pANTLR3_INPUT_STREAM	input;
     97 
     98     /** Pointer to the implementation of a base recognizer, which the lexer
     99      *  creates and then overrides with its own lexer oriented functions (the
    100      *  default implementation is parser oriented). This also contains a
    101      *  token source interface, which the lexer instance will provide to anything
    102      *  that needs it, which is anything else that implements a base recognizer,
    103      *  such as a parser.
    104      */
    105     pANTLR3_BASE_RECOGNIZER	rec;
    106 
    107     /** Pointer to a function that sets the charstream source for the lexer and
    108      *  causes it to be reset.
    109      */
    110     void			(*setCharStream)    (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input);
    111 
    112     /** Pointer to a function that switches the current character input stream to
    113      *  a new one, saving the old one, which we will revert to at the end of this
    114      *  new one.
    115      */
    116     void			(*pushCharStream)   (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input);
    117 
    118     /** Pointer to a function that abandons the current input stream, whether it
    119      *  is empty or not, and reverts to the previous stacked input stream.
    120      */
    121     void			(*popCharStream)    (struct ANTLR3_LEXER_struct * lexer);
    122 
    123     /** Pointer to a function that emits the supplied token as the next token in
    124      *  the stream.
    125      */
    126     void			(*emitNew)	    (struct ANTLR3_LEXER_struct * lexer, pANTLR3_COMMON_TOKEN token);
    127 
    128     /** Pointer to a function that constructs a new token from the information stored in the lexer.
    129      */
    130     pANTLR3_COMMON_TOKEN	(*emit)		    (struct ANTLR3_LEXER_struct * lexer);
    131 
    132     /** Pointer to the user provided (either manually or through code generation)
    133      *  function that runs the lexing rules and produces
    134      *  the next token if there is one. This is called from nextToken() in the
    135      *  pANTLR3_TOKEN_SOURCE. Note that the input parameter for this function is
    136      *  the generated lexer context (stored in ctx in this interface); it is a generated
    137      *  function and expects the context to be the generated lexer.
    138      */
    139     void	        (*mTokens)		    (void * ctx);
    140 
    141     /** Pointer to a function that attempts to match and consume the specified string from the input
    142      *  stream. Note that strings must be passed as terminated arrays of ANTLR3_UCHAR. Strings are terminated
    143      *  with 0xFFFFFFFF (the value of ANTLR3_STRING_TERMINATOR), which is an invalid UTF32 character.
    144      */
    145     ANTLR3_BOOLEAN	(*matchs)	    (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR * string);
    146 
    147     /** Pointer to a function that matches and consumes the specified character from the input stream.
    148      *  The input stream is required to provide characters via LA() as UTF32 characters. The default lexer
    149      *  implementation is source encoding agnostic and so input streams do not generally need to
    150      *  override the default implementation.
    151      */
    152     ANTLR3_BOOLEAN	(*matchc)	    (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR c);
    153 
    154     /** Pointer to a function that matches any character in the supplied range (I suppose it could be a token range too,
    155      *  but this would only be useful if the tokens were in some guaranteed order, which is
    156      *  only going to happen with a hand crafted token set).
    157      */
    158     ANTLR3_BOOLEAN	(*matchRange)	    (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR low, ANTLR3_UCHAR high);
    159 
    160     /** Pointer to a function that matches the next token/char in the input stream
    161      *  regardless of what it actually is.
    162      */
    163     void		(*matchAny)	    (struct ANTLR3_LEXER_struct * lexer);
    164 
    165     /** Pointer to a function that recovers from an error found in the input stream.
    166      *  Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
    167      *  be from a mismatched token that the (*match)() could not recover from.
    168      */
    169     void		(*recover)	    (struct ANTLR3_LEXER_struct * lexer);
    170 
    171     /** Pointer to function to return the current line number in the input stream.
    172      *  NOTE(review): the two pointers that follow are undocumented; by name they presumably return the character index and the position within the line - confirm. */
    173     ANTLR3_UINT32	(*getLine)		(struct ANTLR3_LEXER_struct * lexer);
    174     ANTLR3_MARKER	(*getCharIndex)		(struct ANTLR3_LEXER_struct * lexer);
    175     ANTLR3_UINT32	(*getCharPositionInLine)(struct ANTLR3_LEXER_struct * lexer);
    176 
    177     /** Pointer to function to return the text so far for the current token being generated.
    178      */
    179     pANTLR3_STRING	(*getText)	    (struct ANTLR3_LEXER_struct * lexer);
    180 
    181 
    182     /** Pointer to a function that knows how to free the resources of a lexer.
    183      */
    184     void		(*free)		    (struct ANTLR3_LEXER_struct * lexer);
    185 
    186 }
    187     ANTLR3_LEXER;
    188 
    189 #ifdef __cplusplus
    190 }
    191 #endif
    192 
    193 #endif
    194