Home | History | Annotate | Download | only in include
      1 /** \file
      2  * Defines the basic structures used to manipulate character
      3  * streams from any input source. Any character size and encoding
      4  * can in theory be used, so long as a set of functions is provided that
      5  * can return a 32 bit Integer representation of their characters and efficiently mark and revert
      6  * to specific offsets into their input streams.
      7  */
      8 #ifndef	_ANTLR3_INPUT_H
      9 #define	_ANTLR3_INPUT_H
     10 
     11 // [The "BSD licence"]
     12 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
     13 // http://www.temporal-wave.com
     14 // http://www.linkedin.com/in/jimidle
     15 //
     16 // All rights reserved.
     17 //
     18 // Redistribution and use in source and binary forms, with or without
     19 // modification, are permitted provided that the following conditions
     20 // are met:
     21 // 1. Redistributions of source code must retain the above copyright
     22 //    notice, this list of conditions and the following disclaimer.
     23 // 2. Redistributions in binary form must reproduce the above copyright
     24 //    notice, this list of conditions and the following disclaimer in the
     25 //    documentation and/or other materials provided with the distribution.
     26 // 3. The name of the author may not be used to endorse or promote products
     27 //    derived from this software without specific prior written permission.
     28 //
     29 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     30 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     31 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     32 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     33 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     34 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     35 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     36 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     37 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     38 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     39 
     40 #include    <antlr3defs.h>
     41 #include    <antlr3string.h>
     42 #include    <antlr3commontoken.h>
     43 #include    <antlr3intstream.h>
     44 #include    <antlr3convertutf.h>
     45 
     46 #ifdef __cplusplus
     47 extern "C" {
     48 #endif
     49 
     50 
     51 
     52 /// Master context structure for an ANTLR3 C runtime based input stream.
     53 /// \ingroup apistructures
     54 /// Holds the stream state fields and the function pointers that make up the stream API.
     55 typedef	struct	ANTLR3_INPUT_STREAM_struct
     56 {
     57     /** Interfaces that provide streams must all provide
     58      *  a generic ANTLR3_INT_STREAM interface and an ANTLR3_INPUT_STREAM
     59      *  is no different.
     60      */
     61     pANTLR3_INT_STREAM	istream;
     62 
     63     /** Whatever super structure is providing the INPUT stream needs a pointer to itself
     64      *  so that this can be passed back to it whenever the API functions
     65      *  are called back from this interface.
     66      */
     67     void	      * super;
     68 
     69     /** Pointer to the start of the input string; characters may be
     70      *  taken as offsets from here and are in the original input format encoding.
     71      */
     72     void	      *	data;
     73 
     74     /** Indicates if the data pointer was allocated by us, and so should be freed
     75      *  when the stream dies.
     76      */
     77     int			isAllocated;
     78 
     79     /** String factory for this input stream
     80      */
     81     pANTLR3_STRING_FACTORY  strFactory;
     82 
     83 
     84     /** Pointer to the next character to be consumed from the input data.
     85      *  This is cast to point at the encoding of the original file that
     86      *  was read by the functions installed as pointers in this input stream
     87      *  context instance at file/string/whatever load time.
     88      */
     89     void	      * nextChar;
     90 
     91     /** Number of characters that can be consumed at this point in time.
     92      *  Mostly this is just what is left in the pre-read buffer, but if the
     93      *  input source is a stream such as a socket or something then we may
     94      *  call special read code to wait for more input.
     95      */
     96     ANTLR3_UINT32	sizeBuf;
     97 
     98     /** The line number we are traversing in the input file. This gets incremented
     99      *  by a newline() call in the lexer grammar actions.
    100      */
    101     ANTLR3_UINT32	line;
    102 
    103     /** Pointer into the input buffer where the current line
    104      *  started.
    105      */
    106     void	      * currentLine;
    107 
    108     /** The offset within the current line of the current character
    109      */
    110     ANTLR3_INT32	charPositionInLine;
    111 
    112     /** Tracks how deep mark() calls are nested
    113      */
    114     ANTLR3_UINT32	markDepth;
    115 
    116     /** List of mark() points in the input stream
    117      */
    118     pANTLR3_VECTOR	markers;
    119 
    120     /** File name string. If you set it manually, point it at allocated
    121      * memory, as it will be free()d.
    122      */
    123     pANTLR3_STRING	fileName;
    124 
    125     /** File number, needs to be set manually to some file index of your devising.
    126      */
    127     ANTLR3_UINT32	fileNo;
    128 
    129     /* API: function pointers installed on the stream */
    130 
    131 
    132    /** Pointer to function that closes the input stream (close).
    133      *  NOTE(review): free is not documented here; presumably it releases the stream - confirm. */
    134     void		(*close)	(struct	ANTLR3_INPUT_STREAM_struct * input);
    135     void		(*free)		(struct	ANTLR3_INPUT_STREAM_struct * input);
    136 
    137     /** Pointer to function that resets the input stream
    138      */
    139     void		(*reset)	(struct	ANTLR3_INPUT_STREAM_struct * input);
    140 
    141     /** Pointer to a function that reuses and resets an input stream by
    142      *  supplying a new 'source' (inString of the given size, with a new name)
    143      */
    144     void                (*reuse)        (struct	ANTLR3_INPUT_STREAM_struct * input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name);
    145 
    146     /**
    147      * Pointer to function that installs a version of LA that always
    148      * returns upper case. Only valid for character streams and creates a case
    149      * insensitive lexer if the lexer tokens are described in upper case. The
    150      * tokens will preserve case in the token text.
    151      */
    152     void		(*setUcaseLA)		(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);
    153 
    154     /** Pointer to function to return input stream element at 1 based
    155      *  offset from nextChar. Same as _LA for char stream, but token
    156      *  streams etc. have one of these that does other stuff of course.
    157      */
    158     void *		(*_LT)		(struct	ANTLR3_INPUT_STREAM_struct * input, ANTLR3_INT32 lt);
    159 
    160     /** Pointer to function to return the total size of the input buffer. For streams
    161      *  this may be just the total we have available so far. This means of course that
    162      *  the input stream must be careful to accumulate enough input so that any backtracking
    163      *  can be satisfied.
    164      */
    165     ANTLR3_UINT32	(*size)		(struct ANTLR3_INPUT_STREAM_struct * input);
    166 
    167     /** Pointer to function to return a substring of the input stream. String is returned in allocated
    168      *  memory and is in same encoding as the input stream itself, NOT internal ANTLR3_UCHAR form.
    169      */
    170     pANTLR3_STRING	(*substr)	(struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
    171 
    172     /** Pointer to function to return the current line number in the input stream
    173      */
    174     ANTLR3_UINT32	(*getLine)	(struct ANTLR3_INPUT_STREAM_struct * input);
    175 
    176     /** Pointer to function to return the current line buffer in the input stream.
    177      *  The pointer returned is directly into the input stream so you must copy
    178      *  it if you wish to manipulate it without damaging the input stream. Encoding
    179      *  is obviously in the same form as the input stream.
    180      *  \remark
    181      *    - Note that this function will be inaccurate if setLine is called as there
    182      *      is no way at the moment to position the input stream at a particular line
    183      *	    number offset.
    184      */
    185     void	  *	(*getLineBuf)	(struct ANTLR3_INPUT_STREAM_struct * input);
    186 
    187     /** Pointer to function to return the current offset in the current input stream line.
    188      *  NOTE(review): the return type is ANTLR3_UINT32 while the charPositionInLine field is ANTLR3_INT32. */
    189     ANTLR3_UINT32	(*getCharPositionInLine)  (struct ANTLR3_INPUT_STREAM_struct * input);
    190 
    191     /** Pointer to function to set the current line number in the input stream
    192      */
    193     void		(*setLine)		  (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 line);
    194 
    195     /** Pointer to function to set the current position in the current line.
    196      */
    197     void		(*setCharPositionInLine)  (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 position);
    198 
    199     /** Pointer to function to override the default newline character that the input stream
    200      *  looks for to trigger the line/offset and line buffer recording information.
    201      *  \remark
    202      *   - By default the character '\n' will be installed as the newline trigger character. When this
    203      *     character is seen by the consume() function then the current line number is incremented and the
    204      *     current line offset is reset to 0. The pointer for the line of input we are consuming
    205      *     is updated to point to the next character after this one in the input stream (which means it
    206      *     may become invalid if the last newline character in the file is seen, so watch out).
    207      *   - If for some reason you do not want the counters and pointers to be reset, you can set the
    208      *     character to some impossible character such as '\0' or whatever.
    209      *   - This is a single character only, so choose the last character in a sequence of two or more.
    210      *   - This is only a simple aid to error reporting - if you have a complicated binary input structure
    211      *     it may not be adequate, but you can always override every function in the input stream with your
    212      *     own of course, and can even write your own complete input stream set if you like.
    213      *   - It is your responsibility to set a valid character for the input stream type. There is no point
    214      *     setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never
    215      *	   trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF
    216      */
    217     void		(*SetNewLineChar)	    (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 newlineChar);
    218 
    219     /// Character that automatically causes an internal line count
    220     ///  increment.
    221     ///
    222     ANTLR3_UCHAR	newlineChar;
    223 
    224     /// Indicates the size, in 8 bit units, of a single character. Note that
    225     /// the C runtime does not deal with surrogates as this would be
    226     /// slow and complicated. If this is a UTF-8 stream then this field
    227     /// will be set to 0. Generally you are best working internally with 32 bit characters
    228     /// as this is the most efficient.
    229     ///
    230     ANTLR3_UINT8	charByteSize;
    231 
    232     /// Indicates the encoding scheme used in this input stream
    233     ///
    234     ANTLR3_UINT32       encoding;
    235 }
    236 
    237     ANTLR3_INPUT_STREAM;
    238 
    239 
    240 /** \brief Structure for tracking lexer input states as part of mark()
    241  *  and rewind() of the lexer. Its fields share names with fields of ANTLR3_INPUT_STREAM.
    242  */
    243 typedef	struct	ANTLR3_LEX_STATE_struct
    244 {
    245         /** Pointer to the next character to be consumed from the input data.
    246      *  This is cast to point at the encoding of the original file that
    247      *  was read by the functions installed as pointers in this input stream
    248      *  context instance at file/string/whatever load time.
    249      */
    250     void	      * nextChar;
    251 
    252     /** The line number we are traversing in the input file. This gets incremented
    253      *  by a newline() call in the lexer grammar actions.
    254      */
    255     ANTLR3_UINT32	line;
    256 
    257     /** Pointer into the input buffer where the current line
    258      *  started.
    259      */
    260     void	      * currentLine;
    261 
    262     /** The offset within the current line of the current character
    263      */
    264     ANTLR3_INT32	charPositionInLine;
    265 
    266 }
    267     ANTLR3_LEX_STATE;
    268 
    269 /* Prototypes of the SetupStream functions (8 bit, UTF-16, UTF-32, UTF-8, EBCDIC, generic); each takes the input stream to set up.
    270  * NOTE(review): presumably the BigEndian flags give the byte order of the machine and of the input data - confirm. */
    271 void	    antlr38BitSetupStream	(pANTLR3_INPUT_STREAM input);
    272 void	    antlr3UTF16SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian);
    273 void	    antlr3UTF32SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian);
    274 void	    antlr3UTF8SetupStream	(pANTLR3_INPUT_STREAM input);
    275 void	    antlr3EBCDICSetupStream	(pANTLR3_INPUT_STREAM input);
    276 void        antlr3GenericSetupStream    (pANTLR3_INPUT_STREAM input);
    277 #ifdef __cplusplus
    278 }
    279 #endif
    280 
    281 #endif	/* _ANTLR3_INPUT_H  */
    282