1 /** \file 2 * Base interface for any ANTLR3 lexer. 3 * 4 * An ANLTR3 lexer builds from two sets of components: 5 * 6 * - The runtime components that provide common functionality such as 7 * traversing character streams, building tokens for output and so on. 8 * - The generated rules and struutre of the actual lexer, which call upon the 9 * runtime components. 10 * 11 * A lexer class contains a character input stream, a base recognizer interface 12 * (which it will normally implement) and a token source interface (which it also 13 * implements. The Tokensource interface is called by a token consumer (such as 14 * a parser, but in theory it can be anything that wants a set of abstract 15 * tokens in place of a raw character stream. 16 * 17 * So then, we set up a lexer in a sequence akin to: 18 * 19 * - Create a character stream (something which implements ANTLR3_INPUT_STREAM) 20 * and initialize it. 21 * - Create a lexer interface and tell it where it its input stream is. 22 * This will cause the creation of a base recognizer class, which it will 23 * override with its own implementations of some methods. The lexer creator 24 * can also then in turn override anything it likes. 25 * - The lexer token source interface is then passed to some interface that 26 * knows how to use it, byte calling for a next token. 27 * - When a next token is called, let ze lexing begin. 28 * 29 */ 30 #ifndef _ANTLR3_LEXER 31 #define _ANTLR3_LEXER 32 33 // [The "BSD licence"] 34 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC 35 // http://www.temporal-wave.com 36 // http://www.linkedin.com/in/jimidle 37 // 38 // All rights reserved. 39 // 40 // Redistribution and use in source and binary forms, with or without 41 // modification, are permitted provided that the following conditions 42 // are met: 43 // 1. Redistributions of source code must retain the above copyright 44 // notice, this list of conditions and the following disclaimer. 45 // 2. Redistributions in binary form must reproduce the above copyright 46 // notice, this list of conditions and the following disclaimer in the 47 // documentation and/or other materials provided with the distribution. 48 // 3. The name of the author may not be used to endorse or promote products 49 // derived from this software without specific prior written permission. 50 // 51 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 52 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 53 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 54 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 55 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 56 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 57 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 58 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 59 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 60 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 61 62 /* Definitions 63 */ 64 #define ANTLR3_STRING_TERMINATOR 0xFFFFFFFF 65 66 #include <antlr3defs.h> 67 #include <antlr3input.h> 68 #include <antlr3commontoken.h> 69 #include <antlr3tokenstream.h> 70 #include <antlr3baserecognizer.h> 71 72 #ifdef __cplusplus 73 extern "C" { 74 #endif 75 76 typedef struct ANTLR3_LEXER_struct 77 { 78 /** If there is a super structure that is implementing the 79 * lexer, then a pointer to it can be stored here in case 80 * implementing functions are overridden by this super structure. 81 */ 82 void * super; 83 84 /** A generated lexer has an mTokens() function, which needs 85 * the context pointer of the generated lexer, not the base lexer interface 86 * this is stored here and initialized by the generated code (or manually 87 * if this is a manually built lexer. 88 */ 89 void * ctx; 90 91 /** A pointer to the character stream whence this lexer is receiving 92 * characters. 93 * TODO: I may come back to this and implement charstream outside 94 * the input stream as per the java implementation. 95 */ 96 pANTLR3_INPUT_STREAM input; 97 98 /** Pointer to the implementation of a base recognizer, which the lexer 99 * creates and then overrides with its own lexer oriented functions (the 100 * default implementation is parser oriented). This also contains a 101 * token source interface, which the lexer instance will provide to anything 102 * that needs it, which is anything else that implements a base recognizer, 103 * such as a parser. 104 */ 105 pANTLR3_BASE_RECOGNIZER rec; 106 107 /** Pointer to a function that sets the charstream source for the lexer and 108 * causes it to be reset. 109 */ 110 void (*setCharStream) (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input); 111 112 /** Pointer to a function that switches the current character input stream to 113 * a new one, saving the old one, which we will revert to at the end of this 114 * new one. 115 */ 116 void (*pushCharStream) (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input); 117 118 /** Pointer to a function that abandons the current input stream, whether it 119 * is empty or not and reverts to the previous stacked input stream. 120 */ 121 void (*popCharStream) (struct ANTLR3_LEXER_struct * lexer); 122 123 /** Pointer to a function that emits the supplied token as the next token in 124 * the stream. 125 */ 126 void (*emitNew) (struct ANTLR3_LEXER_struct * lexer, pANTLR3_COMMON_TOKEN token); 127 128 /** Pointer to a function that constructs a new token from the lexer stored information 129 */ 130 pANTLR3_COMMON_TOKEN (*emit) (struct ANTLR3_LEXER_struct * lexer); 131 132 /** Pointer to the user provided (either manually or through code generation 133 * function that causes the lexer rules to run the lexing rules and produce 134 * the next token if there iss one. This is called from nextToken() in the 135 * pANTLR3_TOKEN_SOURCE. Note that the input parameter for this funciton is 136 * the generated lexer context (stored in ctx in this interface) it is a generated 137 * function and expects the context to be the generated lexer. 138 */ 139 void (*mTokens) (void * ctx); 140 141 /** Pointer to a function that attempts to match and consume the specified string from the input 142 * stream. Note that strings muse be passed as terminated arrays of ANTLR3_UCHAR. Strings are terminated 143 * with 0xFFFFFFFF, which is an invalid UTF32 character 144 */ 145 ANTLR3_BOOLEAN (*matchs) (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR * string); 146 147 /** Pointer to a function that matches and consumes the specified character from the input stream. 148 * The input stream is required to provide characters via LA() as UTF32 characters. The default lexer 149 * implementation is source encoding agnostic and so input streams do not generally need to 150 * override the default implmentation. 151 */ 152 ANTLR3_BOOLEAN (*matchc) (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR c); 153 154 /** Pointer to a function that matches any character in the supplied range (I suppose it could be a token range too 155 * but this would only be useful if the tokens were in tsome guaranteed order which is 156 * only going to happen with a hand crafted token set). 157 */ 158 ANTLR3_BOOLEAN (*matchRange) (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR low, ANTLR3_UCHAR high); 159 160 /** Pointer to a function that matches the next token/char in the input stream 161 * regardless of what it actaully is. 162 */ 163 void (*matchAny) (struct ANTLR3_LEXER_struct * lexer); 164 165 /** Pointer to a function that recovers from an error found in the input stream. 166 * Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also 167 * be from a mismatched token that the (*match)() could not recover from. 168 */ 169 void (*recover) (struct ANTLR3_LEXER_struct * lexer); 170 171 /** Pointer to function to return the current line number in the input stream 172 */ 173 ANTLR3_UINT32 (*getLine) (struct ANTLR3_LEXER_struct * lexer); 174 ANTLR3_MARKER (*getCharIndex) (struct ANTLR3_LEXER_struct * lexer); 175 ANTLR3_UINT32 (*getCharPositionInLine)(struct ANTLR3_LEXER_struct * lexer); 176 177 /** Pointer to function to return the text so far for the current token being generated 178 */ 179 pANTLR3_STRING (*getText) (struct ANTLR3_LEXER_struct * lexer); 180 181 182 /** Pointer to a function that knows how to free the resources of a lexer 183 */ 184 void (*free) (struct ANTLR3_LEXER_struct * lexer); 185 186 } 187 ANTLR3_LEXER; 188 189 #ifdef __cplusplus 190 } 191 #endif 192 193 #endif 194