1 /** \file 2 * Defines the basic structures used to manipulate character 3 * streams from any input source. Any character size and encoding 4 * can in theory be used, so long as a set of functinos is provided that 5 * can return a 32 bit Integer representation of their characters amd efficiently mark and revert 6 * to specific offsets into their input streams. 7 */ 8 #ifndef _ANTLR3_INPUT_H 9 #define _ANTLR3_INPUT_H 10 11 // [The "BSD licence"] 12 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC 13 // http://www.temporal-wave.com 14 // http://www.linkedin.com/in/jimidle 15 // 16 // All rights reserved. 17 // 18 // Redistribution and use in source and binary forms, with or without 19 // modification, are permitted provided that the following conditions 20 // are met: 21 // 1. Redistributions of source code must retain the above copyright 22 // notice, this list of conditions and the following disclaimer. 23 // 2. Redistributions in binary form must reproduce the above copyright 24 // notice, this list of conditions and the following disclaimer in the 25 // documentation and/or other materials provided with the distribution. 26 // 3. The name of the author may not be used to endorse or promote products 27 // derived from this software without specific prior written permission. 28 // 29 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 30 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 31 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 32 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 33 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 34 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 35 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 36 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 37 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 38 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 39 40 #include <antlr3defs.h> 41 #include <antlr3string.h> 42 #include <antlr3commontoken.h> 43 #include <antlr3intstream.h> 44 #include <antlr3convertutf.h> 45 46 #ifdef __cplusplus 47 extern "C" { 48 #endif 49 50 51 52 /// Master context structure for an ANTLR3 C runtime based input stream. 53 /// \ingroup apistructures 54 /// 55 typedef struct ANTLR3_INPUT_STREAM_struct 56 { 57 /** Interfaces that provide streams must all provide 58 * a generic ANTLR3_INT_STREAM interface and an ANTLR3_INPUT_STREAM 59 * is no different. 60 */ 61 pANTLR3_INT_STREAM istream; 62 63 /** Whatever super structure is providing the INPUT stream needs a pointer to itself 64 * so that this can be passed back to it whenever the api functions 65 * are called back from this interface. 66 */ 67 void * super; 68 69 /** Pointer the start of the input string, characters may be 70 * taken as offsets from here and in original input format encoding. 71 */ 72 void * data; 73 74 /** Indicates if the data pointer was allocated by us, and so should be freed 75 * when the stream dies. 76 */ 77 int isAllocated; 78 79 /** String factory for this input stream 80 */ 81 pANTLR3_STRING_FACTORY strFactory; 82 83 84 /** Pointer to the next character to be consumed from the input data 85 * This is cast to point at the encoding of the original file that 86 * was read by the functions installed as pointer in this input stream 87 * context instance at file/string/whatever load time. 88 */ 89 void * nextChar; 90 91 /** Number of characters that can be consumed at this point in time. 92 * Mostly this is just what is left in the pre-read buffer, but if the 93 * input source is a stream such as a socket or something then we may 94 * call special read code to wait for more input. 95 */ 96 ANTLR3_UINT32 sizeBuf; 97 98 /** The line number we are traversing in the input file. This gets incremented 99 * by a newline() call in the lexer grammar actions. 100 */ 101 ANTLR3_UINT32 line; 102 103 /** Pointer into the input buffer where the current line 104 * started. 105 */ 106 void * currentLine; 107 108 /** The offset within the current line of the current character 109 */ 110 ANTLR3_INT32 charPositionInLine; 111 112 /** Tracks how deep mark() calls are nested 113 */ 114 ANTLR3_UINT32 markDepth; 115 116 /** List of mark() points in the input stream 117 */ 118 pANTLR3_VECTOR markers; 119 120 /** File name string, set to pointer to memory if 121 * you set it manually as it will be free()d 122 */ 123 pANTLR3_STRING fileName; 124 125 /** File number, needs to be set manually to some file index of your devising. 126 */ 127 ANTLR3_UINT32 fileNo; 128 129 /* API */ 130 131 132 /** Pointer to function that closes the input stream 133 */ 134 void (*close) (struct ANTLR3_INPUT_STREAM_struct * input); 135 void (*free) (struct ANTLR3_INPUT_STREAM_struct * input); 136 137 /** Pointer to function that resets the input stream 138 */ 139 void (*reset) (struct ANTLR3_INPUT_STREAM_struct * input); 140 141 /** Pointer to a function that reuses and resets an input stream by 142 * supplying a new 'source' 143 */ 144 void (*reuse) (struct ANTLR3_INPUT_STREAM_struct * input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name); 145 146 /** 147 * Pointer to function that installs a version of LA that always 148 * returns upper case. Only valid for character streams and creates a case 149 * insensitive lexer if the lexer tokens are described in upper case. The 150 * tokens will preserve case in the token text. 151 */ 152 void (*setUcaseLA) (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag); 153 154 /** Pointer to function to return input stream element at 1 based 155 * offset from nextChar. Same as _LA for char stream, but token 156 * streams etc. have one of these that does other stuff of course. 157 */ 158 void * (*_LT) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_INT32 lt); 159 160 /** Pointer to function to return the total size of the input buffer. For streams 161 * this may be just the total we have available so far. This means of course that 162 * the input stream must be careful to accumulate enough input so that any backtracking 163 * can be satisfied. 164 */ 165 ANTLR3_UINT32 (*size) (struct ANTLR3_INPUT_STREAM_struct * input); 166 167 /** Pointer to function to return a substring of the input stream. String is returned in allocated 168 * memory and is in same encoding as the input stream itself, NOT internal ANTLR3_UCHAR form. 169 */ 170 pANTLR3_STRING (*substr) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_MARKER start, ANTLR3_MARKER stop); 171 172 /** Pointer to function to return the current line number in the input stream 173 */ 174 ANTLR3_UINT32 (*getLine) (struct ANTLR3_INPUT_STREAM_struct * input); 175 176 /** Pointer to function to return the current line buffer in the input stream 177 * The pointer returned is directly into the input stream so you must copy 178 * it if you wish to manipulate it without damaging the input stream. Encoding 179 * is obviously in the same form as the input stream. 180 * \remark 181 * - Note taht this function wil lbe inaccurate if setLine is called as there 182 * is no way at the moment to position the input stream at a particular line 183 * number offset. 184 */ 185 void * (*getLineBuf) (struct ANTLR3_INPUT_STREAM_struct * input); 186 187 /** Pointer to function to return the current offset in the current input stream line 188 */ 189 ANTLR3_UINT32 (*getCharPositionInLine) (struct ANTLR3_INPUT_STREAM_struct * input); 190 191 /** Pointer to function to set the current line number in the input stream 192 */ 193 void (*setLine) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 line); 194 195 /** Pointer to function to set the current position in the current line. 196 */ 197 void (*setCharPositionInLine) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 position); 198 199 /** Pointer to function to override the default newline character that the input stream 200 * looks for to trigger the line/offset and line buffer recording information. 201 * \remark 202 * - By default the chracter '\n' will be installed as the newline trigger character. When this 203 * character is seen by the consume() function then the current line number is incremented and the 204 * current line offset is reset to 0. The Pointer for the line of input we are consuming 205 * is updated to point to the next character after this one in the input stream (which means it 206 * may become invalid if the last newline character in the file is seen (so watch out). 207 * - If for some reason you do not want the counters and pointers to be restee, you can set the 208 * chracter to some impossible character such as '\0' or whatever. 209 * - This is a single character only, so choose the last character in a sequence of two or more. 210 * - This is only a simple aid to error reporting - if you have a complicated binary input structure 211 * it may not be adequate, but you can always override every function in the input stream with your 212 * own of course, and can even write your own complete input stream set if you like. 213 * - It is your responsiblity to set a valid character for the input stream type. There is no point 214 * setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never 215 * trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF 216 */ 217 void (*SetNewLineChar) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 newlineChar); 218 219 /// Character that automatically causes an internal line count 220 /// increment. 221 /// 222 ANTLR3_UCHAR newlineChar; 223 224 /// Indicates the size, in 8 bit units, of a single character. Note that 225 /// the C runtime does not deal with surrogates as this would be 226 /// slow and complicated. If this is a UTF-8 stream then this field 227 /// will be set to 0. Generally you are best working internally with 32 bit characters 228 /// as this is the most efficient. 229 /// 230 ANTLR3_UINT8 charByteSize; 231 232 /// Indicates the encoding scheme used in this input stream 233 /// 234 ANTLR3_UINT32 encoding; 235 } 236 237 ANTLR3_INPUT_STREAM; 238 239 240 /** \brief Structure for track lex input states as part of mark() 241 * and rewind() of lexer. 242 */ 243 typedef struct ANTLR3_LEX_STATE_struct 244 { 245 /** Pointer to the next character to be consumed from the input data 246 * This is cast to point at the encoding of the original file that 247 * was read by the functions installed as pointer in this input stream 248 * context instance at file/string/whatever load time. 249 */ 250 void * nextChar; 251 252 /** The line number we are traversing in the input file. This gets incremented 253 * by a newline() call in the lexer grammer actions. 254 */ 255 ANTLR3_UINT32 line; 256 257 /** Pointer into the input buffer where the current line 258 * started. 259 */ 260 void * currentLine; 261 262 /** The offset within the current line of the current character 263 */ 264 ANTLR3_INT32 charPositionInLine; 265 266 } 267 ANTLR3_LEX_STATE; 268 269 /* Prototypes 270 */ 271 void antlr38BitSetupStream (pANTLR3_INPUT_STREAM input); 272 void antlr3UTF16SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian); 273 void antlr3UTF32SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian); 274 void antlr3UTF8SetupStream (pANTLR3_INPUT_STREAM input); 275 void antlr3EBCDICSetupStream (pANTLR3_INPUT_STREAM input); 276 void antlr3GenericSetupStream (pANTLR3_INPUT_STREAM input); 277 #ifdef __cplusplus 278 } 279 #endif 280 281 #endif /* _ANTLR3_INPUT_H */ 282