Home | History | Annotate | Download | only in src
      1 /// \file
      2 /// Base functions to initialize and manipulate any input stream
      3 ///
      4 
      5 // [The "BSD licence"]
      6 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
      7 // http://www.temporal-wave.com
      8 // http://www.linkedin.com/in/jimidle
      9 //
     10 // All rights reserved.
     11 //
     12 // Redistribution and use in source and binary forms, with or without
     13 // modification, are permitted provided that the following conditions
     14 // are met:
     15 // 1. Redistributions of source code must retain the above copyright
     16 //    notice, this list of conditions and the following disclaimer.
     17 // 2. Redistributions in binary form must reproduce the above copyright
     18 //    notice, this list of conditions and the following disclaimer in the
     19 //    documentation and/or other materials provided with the distribution.
     20 // 3. The name of the author may not be used to endorse or promote products
     21 //    derived from this software without specific prior written permission.
     22 //
     23 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     24 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     25 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     26 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     27 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     28 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     29 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     30 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     31 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     32 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     33 
     34 #include    <antlr3input.h>
     35 
     36 // -----------------------------------
     37 // Generic 8 bit input such as latin-1
     38 //
     39 
     40 // 8Bit INT Stream API
     41 //
     42 static	    void	    antlr38BitConsume		(pANTLR3_INT_STREAM is);
     43 static	    ANTLR3_UCHAR    antlr38BitLA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
     44 static	    ANTLR3_UCHAR    antlr38BitLA_ucase		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
     45 static	    ANTLR3_MARKER   antlr38BitIndex		(pANTLR3_INT_STREAM is);
     46 static	    ANTLR3_MARKER   antlr38BitMark		(pANTLR3_INT_STREAM is);
     47 static	    void	    antlr38BitRewind		(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
     48 static	    void	    antlr38BitRewindLast	(pANTLR3_INT_STREAM is);
     49 static	    void	    antlr38BitRelease		(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
     50 static	    void	    antlr38BitSeek		(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
     51 static	    pANTLR3_STRING  antlr38BitGetSourceName	(pANTLR3_INT_STREAM is);
     52 
     53 // 8Bit Charstream API functions
     54 //
     55 static	    void	    antlr3InputClose		(pANTLR3_INPUT_STREAM input);
     56 static	    void	    antlr3InputReset		(pANTLR3_INPUT_STREAM input);
     57 static      void            antlr38BitReuse            (pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name);
     58 static	    void *	    antlr38BitLT		(pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt);
     59 static	    ANTLR3_UINT32   antlr38BitSize		(pANTLR3_INPUT_STREAM input);
     60 static	    pANTLR3_STRING  antlr38BitSubstr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
     61 static	    ANTLR3_UINT32   antlr38BitGetLine		(pANTLR3_INPUT_STREAM input);
     62 static	    void	  * antlr38BitGetLineBuf	(pANTLR3_INPUT_STREAM input);
     63 static	    ANTLR3_UINT32   antlr38BitGetCharPosition	(pANTLR3_INPUT_STREAM input);
     64 static	    void	    antlr38BitSetLine		(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line);
     65 static	    void	    antlr38BitSetCharPosition	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position);
     66 static	    void	    antlr38BitSetNewLineChar	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar);
     67 static	    void	    antlr38BitSetUcaseLA	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);
     68 
     69 // -----------------------------------
     70 // UTF16 (also covers UCS2)
     71 //
     72 // INT Stream API
     73 //
     74 static	    void	    antlr3UTF16Consume	        (pANTLR3_INT_STREAM is);
     75 static	    ANTLR3_UCHAR    antlr3UTF16LA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
     76 static	    void	    antlr3UTF16ConsumeLE        (pANTLR3_INT_STREAM is);
     77 static	    ANTLR3_UCHAR    antlr3UTF16LALE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
     78 static	    void	    antlr3UTF16ConsumeBE        (pANTLR3_INT_STREAM is);
     79 static	    ANTLR3_UCHAR    antlr3UTF16LABE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
     80 static	    ANTLR3_MARKER   antlr3UTF16Index		(pANTLR3_INT_STREAM is);
     81 static	    void	    antlr3UTF16Seek		(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
     82 
     83 // UTF16 Charstream API functions
     84 //
     85 static	    pANTLR3_STRING	antlr3UTF16Substr	(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
     86 
     87 // -----------------------------------
     88 // UTF32 (also covers UCS4)
     89 //
     90 // INT Stream API
     91 //
     92 static	    void	    antlr3UTF32Consume	        (pANTLR3_INT_STREAM is);
     93 static	    ANTLR3_UCHAR    antlr3UTF32LA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
     94 static	    ANTLR3_UCHAR    antlr3UTF32LALE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
     95 static	    ANTLR3_UCHAR    antlr3UTF32LABE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
     96 static	    ANTLR3_MARKER   antlr3UTF32Index		(pANTLR3_INT_STREAM is);
     97 static	    void	    antlr3UTF32Seek		(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
     98 
     99 // UTF32 Charstream API functions
    100 //
    101 static	    pANTLR3_STRING  antlr3UTF32Substr	        (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
    102 
    103 // ------------------------------------
    104 // UTF-8
    105 //
    106 static	    void	    antlr3UTF8Consume	        (pANTLR3_INT_STREAM is);
    107 static	    ANTLR3_UCHAR    antlr3UTF8LA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
    108 
    109 // ------------------------------------
    110 // EBCDIC
    111 //
    112 static	    ANTLR3_UCHAR    antlr3EBCDICLA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
    113 
    114 /// \brief Common function to setup function interface for an 8 bit input stream.
    115 ///
    116 /// \param input Input stream context pointer
    117 ///
    118 /// \remark
    119 ///   - Many of the 8 bit oriented file stream handling functions will be usable
    120 ///     by any or at least some, other input streams. Therefore it is perfectly acceptable
    121 ///     to call this function to install the 8Bit handler then override just those functions
    122 ///     that would not work for the particular input encoding, such as consume for instance.
    123 ///
    124 void
    125 antlr38BitSetupStream	(pANTLR3_INPUT_STREAM input)
    126 {
    127     // Build a string factory for this stream
    128     //
    129     input->strFactory	= antlr3StringFactoryNew(input->encoding);
    130 
    131     // Default stream API set up is for 8Bit, so we are done
    132     //
    133 }
    134 
    135 void
    136 antlr3GenericSetupStream  (pANTLR3_INPUT_STREAM input)
    137 {
    138     /* Install function pointers for an 8 bit input
    139      */
    140 
    141     /* Allocate stream interface
    142      */
    143     input->istream		= antlr3IntStreamNew();
    144     input->istream->type        = ANTLR3_CHARSTREAM;
    145     input->istream->super       = input;
    146 
    147     /* Intstream API
    148      */
    149     input->istream->consume	    = antlr38BitConsume;	    // Consume the next 8 bit character in the buffer
    150     input->istream->_LA		    = antlr38BitLA;	            // Return the UTF32 character at offset n (1 based)
    151     input->istream->index	    = antlr38BitIndex;	            // Current index (offset from first character
    152     input->istream->mark	    = antlr38BitMark;		    // Record the current lex state for later restore
    153     input->istream->rewind	    = antlr38BitRewind;	            // How to rewind the input
    154     input->istream->rewindLast	    = antlr38BitRewindLast;	    // How to rewind the input
    155     input->istream->seek	    = antlr38BitSeek;		    // How to seek to a specific point in the stream
    156     input->istream->release	    = antlr38BitRelease;	    // Reset marks after mark n
    157     input->istream->getSourceName   = antlr38BitGetSourceName;      // Return a string that names the input source
    158 
    159     /* Charstream API
    160      */
    161     input->close		    =  antlr3InputClose;	    // Close down the stream completely
    162     input->free			    =  antlr3InputClose;	    // Synonym for free
    163     input->reset		    =  antlr3InputReset;	    // Reset input to start
    164     input->reuse                    =  antlr38BitReuse;             // Install a new input string and reset
    165     input->_LT			    =  antlr38BitLT;		    // Same as _LA for 8 bit file
    166     input->size			    =  antlr38BitSize;		    // Return the size of the input buffer
    167     input->substr		    =  antlr38BitSubstr;	    // Return a string from the input stream
    168     input->getLine		    =  antlr38BitGetLine;	    // Return the current line number in the input stream
    169     input->getLineBuf		    =  antlr38BitGetLineBuf;	    // Return a pointer to the start of the current line being consumed
    170     input->getCharPositionInLine    =  antlr38BitGetCharPosition;   // Return the offset into the current line of input
    171     input->setLine		    =  antlr38BitSetLine;	    // Set the input stream line number (does not set buffer pointers)
    172     input->setCharPositionInLine    =  antlr38BitSetCharPosition;   // Set the offset in to the current line (does not set any pointers)
    173     input->SetNewLineChar	    =  antlr38BitSetNewLineChar;    // Set the value of the newline trigger character
    174     input->setUcaseLA		    =  antlr38BitSetUcaseLA;        // Changes the LA function to return upper case always
    175 
    176     input->charByteSize		    = 1;		// Size in bytes of characters in this stream.
    177 
    178     /* Initialize entries for tables etc
    179      */
    180     input->markers  = NULL;
    181 
    182     /* Set up the input stream brand new
    183      */
    184     input->reset(input);
    185 
    186     /* Install default line separator character (it can be replaced
    187      * by the grammar programmer later)
    188      */
    189     input->SetNewLineChar(input, (ANTLR3_UCHAR)'\n');
    190 }
    191 
    192 static pANTLR3_STRING
    193 antlr38BitGetSourceName(pANTLR3_INT_STREAM is)
    194 {
    195 	return	is->streamName;
    196 }
    197 
    198 /** \brief Close down an input stream and free any memory allocated by it.
    199  *
    200  * \param input Input stream context pointer
    201  */
    202 static void
    203 antlr3InputClose(pANTLR3_INPUT_STREAM input)
    204 {
    205     // Close any markers in the input stream
    206     //
    207     if	(input->markers != NULL)
    208     {
    209 		input->markers->free(input->markers);
    210 		input->markers = NULL;
    211     }
    212 
    213     // Close the string factory
    214     //
    215     if	(input->strFactory != NULL)
    216     {
    217 		input->strFactory->close(input->strFactory);
    218     }
    219 
    220     // Free the input stream buffer if we allocated it
    221     //
    222     if	(input->isAllocated && input->data != NULL)
    223     {
    224 		ANTLR3_FREE(input->data);
    225 		input->data = NULL;
    226     }
    227 
    228     input->istream->free(input->istream);
    229 
    230     // Finally, free the space for the structure itself
    231     //
    232     ANTLR3_FREE(input);
    233 
    234     // Done
    235     //
    236 }
    237 
    238 static void
    239 antlr38BitSetUcaseLA		(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag)
    240 {
    241 	if	(flag)
    242 	{
    243 		// Return the upper case version of the characters
    244 		//
    245 		input->istream->_LA		    =  antlr38BitLA_ucase;
    246 	}
    247 	else
    248 	{
    249 		// Return the raw characters as they are in the buffer
    250 		//
    251 		input->istream->_LA		    =  antlr38BitLA;
    252 	}
    253 }
    254 
    255 
    256 /** \brief Reset a re-startable input stream to the start
    257  *
    258  * \param input Input stream context pointer
    259  */
    260 static void
    261 antlr3InputReset(pANTLR3_INPUT_STREAM input)
    262 {
    263 
    264     input->nextChar		= input->data;	/* Input at first character */
    265     input->line			= 1;		/* starts at line 1	    */
    266     input->charPositionInLine	= -1;
    267     input->currentLine		= input->data;
    268     input->markDepth		= 0;		/* Reset markers	    */
    269 
    270     /* Clear out up the markers table if it is there
    271      */
    272     if	(input->markers != NULL)
    273     {
    274         input->markers->clear(input->markers);
    275     }
    276     else
    277     {
    278         /* Install a new markers table
    279          */
    280         input->markers  = antlr3VectorNew(0);
    281     }
    282 }
    283 
    284 /** Install a new source code in to a working input stream so that the
    285  *  input stream can be reused.
    286  */
    287 static void
    288 antlr38BitReuse(pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
    289 {
    290     input->isAllocated	= ANTLR3_FALSE;
    291     input->data		= inString;
    292     input->sizeBuf	= size;
    293 
    294     // Now we can set up the file name. As we are reusing the stream, there may already
    295     // be a string that we can reuse for holding the filename.
    296     //
    297 	if	(input->istream->streamName == NULL)
    298 	{
    299 		input->istream->streamName	= input->strFactory->newStr(input->strFactory, name == NULL ? (pANTLR3_UINT8)"-memory-" : name);
    300 		input->fileName		= input->istream->streamName;
    301 	}
    302 	else
    303 	{
    304 		input->istream->streamName->set(input->istream->streamName,  (name == NULL ? (const char *)"-memory-" : (const char *)name));
    305 	}
    306 
    307     input->reset(input);
    308 }
    309 
    310 /** \brief Consume the next character in an 8 bit input stream
    311  *
    312  * \param input Input stream context pointer
    313  */
    314 static void
    315 antlr38BitConsume(pANTLR3_INT_STREAM is)
    316 {
    317     pANTLR3_INPUT_STREAM input;
    318 
    319     input   = ((pANTLR3_INPUT_STREAM) (is->super));
    320 
    321     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
    322     {
    323 	/* Indicate one more character in this line
    324 	 */
    325 	input->charPositionInLine++;
    326 
    327 	if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar)) == input->newlineChar)
    328 	{
    329 	    /* Reset for start of a new line of input
    330 	     */
    331 	    input->line++;
    332 	    input->charPositionInLine	= 0;
    333 	    input->currentLine		= (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
    334 	}
    335 
    336 	/* Increment to next character position
    337 	 */
    338 	input->nextChar = (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
    339     }
    340 }
    341 
    342 /** \brief Return the input element assuming an 8 bit ascii input
    343  *
    344  * \param[in] input Input stream context pointer
    345  * \param[in] la 1 based offset of next input stream element
    346  *
    347  * \return Next input character in internal ANTLR3 encoding (UTF32)
    348  */
    349 static ANTLR3_UCHAR
    350 antlr38BitLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
    351 {
    352     pANTLR3_INPUT_STREAM input;
    353 
    354     input   = ((pANTLR3_INPUT_STREAM) (is->super));
    355 
    356     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
    357     {
    358 		return	ANTLR3_CHARSTREAM_EOF;
    359     }
    360     else
    361     {
    362 		return	(ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar + la - 1));
    363     }
    364 }
    365 
    366 /** \brief Return the input element assuming an 8 bit input and
    367  *         always return the UPPER CASE character.
    368  *		   Note that this is 8 bit and so we assume that the toupper
    369  *		   function will use the correct locale for 8 bits.
    370  *
    371  * \param[in] input Input stream context pointer
    372  * \param[in] la 1 based offset of next input stream element
    373  *
    374  * \return Next input character in internal ANTLR3 encoding (UTF32)
    375  */
    376 static ANTLR3_UCHAR
    377 antlr38BitLA_ucase	(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
    378 {
    379     pANTLR3_INPUT_STREAM input;
    380 
    381     input   = ((pANTLR3_INPUT_STREAM) (is->super));
    382 
    383     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
    384     {
    385 		return	ANTLR3_CHARSTREAM_EOF;
    386     }
    387     else
    388     {
    389 		return	(ANTLR3_UCHAR)toupper((*((pANTLR3_UINT8)input->nextChar + la - 1)));
    390     }
    391 }
    392 
    393 
    394 /** \brief Return the input element assuming an 8 bit ascii input
    395  *
    396  * \param[in] input Input stream context pointer
    397  * \param[in] lt 1 based offset of next input stream element
    398  *
    399  * \return Next input character in internal ANTLR3 encoding (UTF32)
    400  */
    401 static void *
    402 antlr38BitLT(pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt)
    403 {
    404     /* Casting is horrible but it means no warnings and LT should never be called
    405      * on a character stream anyway I think. If it is then, the void * will need to be
    406      * cast back in a similar manner. Yuck! But this means that LT for Token streams and
    407      * tree streams is correct.
    408      */
    409     return (ANTLR3_FUNC_PTR(input->istream->_LA(input->istream, lt)));
    410 }
    411 
    412 /** \brief Calculate the current index in the output stream.
    413  * \param[in] input Input stream context pointer
    414  */
    415 static ANTLR3_MARKER
    416 antlr38BitIndex(pANTLR3_INT_STREAM is)
    417 {
    418     pANTLR3_INPUT_STREAM input;
    419 
    420     input   = ((pANTLR3_INPUT_STREAM) (is->super));
    421 
    422     return  (ANTLR3_MARKER)(((pANTLR3_UINT8)input->nextChar));
    423 }
    424 
    425 /** \brief Return the size of the current input stream, as an 8Bit file
    426  *   which in this case is the total input. Other implementations may provide
    427  *   more sophisticated implementations to deal with non-recoverable streams
    428  *   and so on.
    429  *
    430  * \param[in] input Input stream context pointer
    431  */
    432 static	ANTLR3_UINT32
    433 antlr38BitSize(pANTLR3_INPUT_STREAM input)
    434 {
    435     return  input->sizeBuf;
    436 }
    437 
    438 /** \brief Mark the current input point in an 8Bit 8 bit stream
    439  *  such as a file stream, where all the input is available in the
    440  *  buffer.
    441  *
    442  * \param[in] is Input stream context pointer
    443  */
    444 static ANTLR3_MARKER
    445 antlr38BitMark	(pANTLR3_INT_STREAM is)
    446 {
    447     pANTLR3_LEX_STATE	    state;
    448     pANTLR3_INPUT_STREAM    input;
    449 
    450     input   = ((pANTLR3_INPUT_STREAM) (is->super));
    451 
    452     /* New mark point
    453      */
    454     input->markDepth++;
    455 
    456     /* See if we are revisiting a mark as we can just reuse the vector
    457      * entry if we are, otherwise, we need a new one
    458      */
    459     if	(input->markDepth > input->markers->count)
    460     {
    461 	state	= ANTLR3_MALLOC(sizeof(ANTLR3_LEX_STATE));
    462 
    463 	/* Add it to the table
    464 	 */
    465 	input->markers->add(input->markers, state, ANTLR3_FREE_FUNC);	/* No special structure, just free() on delete */
    466     }
    467     else
    468     {
    469 	state	= (pANTLR3_LEX_STATE)input->markers->get(input->markers, input->markDepth - 1);
    470 
    471 	/* Assume no errors for speed, it will just blow up if the table failed
    472 	 * for some reasons, hence lots of unit tests on the tables ;-)
    473 	 */
    474     }
    475 
    476     /* We have created or retrieved the state, so update it with the current
    477      * elements of the lexer state.
    478      */
    479     state->charPositionInLine	= input->charPositionInLine;
    480     state->currentLine		= input->currentLine;
    481     state->line			= input->line;
    482     state->nextChar		= input->nextChar;
    483 
    484     is->lastMarker  = input->markDepth;
    485 
    486     /* And that's it
    487      */
    488     return  input->markDepth;
    489 }
    490 /** \brief Rewind the lexer input to the state specified by the last produced mark.
    491  *
    492  * \param[in] input Input stream context pointer
    493  *
    494  * \remark
    495  * Assumes 8 Bit input stream.
    496  */
    497 static void
    498 antlr38BitRewindLast	(pANTLR3_INT_STREAM is)
    499 {
    500     is->rewind(is, is->lastMarker);
    501 }
    502 
    503 /** \brief Rewind the lexer input to the state specified by the supplied mark.
    504  *
    505  * \param[in] input Input stream context pointer
    506  *
    507  * \remark
    508  * Assumes 8 Bit input stream.
    509  */
    510 static void
    511 antlr38BitRewind	(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
    512 {
    513     pANTLR3_LEX_STATE	state;
    514     pANTLR3_INPUT_STREAM input;
    515 
    516     input   = ((pANTLR3_INPUT_STREAM) is->super);
    517 
    518     /* Perform any clean up of the marks
    519      */
    520     input->istream->release(input->istream, mark);
    521 
    522     /* Find the supplied mark state
    523      */
    524     state   = (pANTLR3_LEX_STATE)input->markers->get(input->markers, (ANTLR3_UINT32)(mark - 1));
    525 
    526     /* Seek input pointer to the requested point (note we supply the void *pointer
    527      * to whatever is implementing the int stream to seek).
    528      */
    529     antlr38BitSeek(is, (ANTLR3_MARKER)(state->nextChar));
    530 
    531     /* Reset to the reset of the information in the mark
    532      */
    533     input->charPositionInLine	= state->charPositionInLine;
    534     input->currentLine		= state->currentLine;
    535     input->line			= state->line;
    536     input->nextChar		= state->nextChar;
    537 
    538     /* And we are done
    539      */
    540 }
    541 
    542 /** \brief Rewind the lexer input to the state specified by the supplied mark.
    543  *
    544  * \param[in] input Input stream context pointer
    545  *
    546  * \remark
    547  * Assumes 8 Bit input stream.
    548  */
    549 static void
    550 antlr38BitRelease	(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
    551 {
    552     pANTLR3_INPUT_STREAM input;
    553 
    554     input   = ((pANTLR3_INPUT_STREAM) (is->super));
    555 
    556     /* We don't do much here in fact as we never free any higher marks in
    557      * the hashtable as we just resuse any memory allocated for them.
    558      */
    559     input->markDepth	= (ANTLR3_UINT32)(mark - 1);
    560 }
    561 
    562 /** \brief Rewind the lexer input to the state specified by the supplied mark.
    563  *
    564  * \param[in] input Input stream context pointer
    565  *
    566  * \remark
    567  * Assumes 8 Bit input stream.
    568  */
    569 static void
    570 antlr38BitSeek	(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
    571 {
    572 	ANTLR3_INT32   count;
    573 	pANTLR3_INPUT_STREAM input;
    574 
    575 	input   = ANTLR3_FUNC_PTR(((pANTLR3_INPUT_STREAM) is->super));
    576 
    577 	/* If the requested seek point is less than the current
    578 	* input point, then we assume that we are resetting from a mark
    579 	* and do not need to scan, but can just set to there.
    580 	*/
    581 	if	(seekPoint <= (ANTLR3_MARKER)(input->nextChar))
    582 	{
    583 		input->nextChar	= ((pANTLR3_UINT8) seekPoint);
    584 	}
    585 	else
    586 	{
    587 		count	= (ANTLR3_UINT32)(seekPoint - (ANTLR3_MARKER)(input->nextChar));
    588 
    589 		while (count--)
    590 		{
    591 			is->consume(is);
    592 		}
    593 	}
    594 }
    595 /** Return a substring of the 8 bit input stream in
    596  *  newly allocated memory.
    597  *
    598  * \param input Input stream context pointer
    599  * \param start Offset in input stream where the string starts
    600  * \param stop  Offset in the input stream where the string ends.
    601  */
    602 static pANTLR3_STRING
    603 antlr38BitSubstr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
    604 {
    605 	return  input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, (ANTLR3_UINT32)(stop - start + 1));
    606 }
    607 
    608 /** \brief Return the line number as understood by the 8 bit input stream.
    609  *
    610  * \param input Input stream context pointer
    611  * \return	Line number in input stream that we believe we are working on.
    612  */
    613 static ANTLR3_UINT32
    614 antlr38BitGetLine		(pANTLR3_INPUT_STREAM input)
    615 {
    616     return  input->line;
    617 }
    618 
    619 /** Return a pointer into the input stream that points at the start
    620  *  of the current input line as triggered by the end of line character installed
    621  *  for the stream ('\n' unless told differently).
    622  *
    623  * \param[in] input
    624  */
    625 static void	  *
    626 antlr38BitGetLineBuf	(pANTLR3_INPUT_STREAM input)
    627 {
    628     return  input->currentLine;
    629 }
    630 
    631 /** Return the current offset in to the current line in the input stream.
    632  *
    633  * \param input Input stream context pointer
    634  * \return      Current line offset
    635  */
    636 static ANTLR3_UINT32
    637 antlr38BitGetCharPosition	(pANTLR3_INPUT_STREAM input)
    638 {
    639     return  input->charPositionInLine;
    640 }
    641 
    642 /** Set the current line number as understood by the input stream.
    643  *
    644  * \param input Input stream context pointer
    645  * \param line  Line number to tell the input stream we are on
    646  *
    647  * \remark
    648  *  This function does not change any pointers, it just allows the programmer to set the
    649  *  line number according to some external criterion, such as finding a lexed directive
    650  *  like: #nnn "file.c" for instance, such that error reporting and so on in is in sync
    651  *  with some original source format.
    652  */
    653 static void
    654 antlr38BitSetLine		(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line)
    655 {
    656     input->line	= line;
    657 }
    658 
    659 /** Set the current offset in the current line to be a particular setting.
    660  *
    661  * \param[in] input    Input stream context pointer
    662  * \param[in] position New setting for current offset.
    663  *
    664  * \remark
    665  * This does not set the actual pointers in the input stream, it is purely for reporting
    666  * purposes and so on as per antlr38BitSetLine();
    667  */
    668 static void
    669 antlr38BitSetCharPosition	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position)
    670 {
    671     input->charPositionInLine = position;
    672 }
    673 
    674 /** Set the newline trigger character in the input stream to the supplied parameter.
    675  *
    676  * \param[in] input	    Input stream context pointer
    677  * \param[in] newlineChar   Character to set to be the newline trigger.
    678  *
    679  * \remark
    680  *  - The supplied newLineChar is in UTF32 encoding (which means ASCII and latin1 etc
    681  *    are the same encodings), but the input stream catered to by this function is 8 bit
    682  *    only, so it is up to the programmer to ensure that the character supplied is valid.
    683  */
    684 static void
    685 antlr38BitSetNewLineChar	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar)
    686 {
    687     input->newlineChar	= newlineChar;
    688 }
    689 
    690 
    691 /// \brief Common function to setup function interface for a UTF16 or UCS2 input stream.
    692 ///
    693 /// \param input Input stream context pointer
    694 ///
    695 /// \remark
    696 ///  - Strictly speaking, there is no such thing as a UCS2 input stream as the term
    697 ///    tends to confuse the notions of character encoding, unicode and so on. UCS2 is
    698 ///    essentially UTF16 without any surrogates and so the standard UTF16
    699 ///    input stream is able to handle it without any special code.
    700 ///
    701 void
    702 antlr3UTF16SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian)
    703 {
    704     // Build a string factory for this stream. This is a UTF16 string factory which is a standard
    705     // part of the ANTLR3 string. The string factory is then passed through the whole chain
    706     // of lexer->parser->tree->treeparser and so on.
    707     //
    708     input->strFactory	= antlr3StringFactoryNew(input->encoding);
    709 
    710     // Generic API that does not care about endianess.
    711     //
    712     input->istream->index	    =  antlr3UTF16Index;            // Calculate current index in input stream, UTF16 based
    713     input->substr		    =  antlr3UTF16Substr;	    // Return a string from the input stream
    714     input->istream->seek	    =  antlr3UTF16Seek;		    // How to seek to a specific point in the stream
    715 
    716     // We must install different UTF16 routines according to whether the input
    717     // is the same endianess as the machine we are executing upon or not. If it is not
    718     // then we must install methods that can convert the endianess on the fly as they go
    719     //
    720 
    721     switch (machineBigEndian)
    722     {
    723         case    ANTLR3_TRUE:
    724 
    725             // Machine is Big Endian, if the input is also then install the
    726             // methods that do not access input by bytes and reverse them.
    727             // Otherwise install endian aware methods.
    728             //
    729             if  (inputBigEndian == ANTLR3_TRUE)
    730             {
    731                 // Input is machine compatible
    732                 //
    733                 input->istream->consume	    =  antlr3UTF16Consume;	    // Consume the next UTF16 character in the buffer
    734                 input->istream->_LA         =  antlr3UTF16LA;		    // Return the UTF32 character at offset n (1 based)
    735             }
    736             else
    737             {
    738                 // Need to use methods that know that the input is little endian
    739                 //
    740                 input->istream->consume	    =  antlr3UTF16ConsumeLE;	    // Consume the next UTF16 character in the buffer
    741                 input->istream->_LA         =  antlr3UTF16LALE;		    // Return the UTF32 character at offset n (1 based)
    742             }
    743             break;
    744 
    745         case    ANTLR3_FALSE:
    746 
    747             // Machine is Little Endian, if the input is also then install the
    748             // methods that do not access input by bytes and reverse them.
    749             // Otherwise install endian aware methods.
    750             //
    751             if  (inputBigEndian == ANTLR3_FALSE)
    752             {
    753                 // Input is machine compatible
    754                 //
    755                 input->istream->consume	    =  antlr3UTF16Consume;	    // Consume the next UTF16 character in the buffer
    756                 input->istream->_LA         =  antlr3UTF16LA;		    // Return the UTF32 character at offset n (1 based)
    757             }
    758             else
    759             {
    760                 // Need to use methods that know that the input is Big Endian
    761                 //
    762                 input->istream->consume	    =  antlr3UTF16ConsumeBE;	    // Consume the next UTF16 character in the buffer
    763                 input->istream->_LA         =  antlr3UTF16LABE;		    // Return the UTF32 character at offset n (1 based)
    764             }
    765             break;
    766     }
    767 
    768 
    769     input->charByteSize		    = 2;			    // Size in bytes of characters in this stream.
    770 
    771 }
    772 
    773 /// \brief Consume the next character in a UTF16 input stream
    774 ///
    775 /// \param input Input stream context pointer
    776 ///
    777 static void
    778 antlr3UTF16Consume(pANTLR3_INT_STREAM is)
    779 {
    780 	pANTLR3_INPUT_STREAM input;
    781         UTF32   ch;
    782         UTF32   ch2;
    783 
    784 	input   = ((pANTLR3_INPUT_STREAM) (is->super));
    785 
    786         // Buffer size is always in bytes
    787         //
    788 	if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
    789 	{
    790 		// Indicate one more character in this line
    791 		//
    792 		input->charPositionInLine++;
    793 
    794 		if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
    795 		{
    796 			// Reset for start of a new line of input
    797 			//
    798 			input->line++;
    799 			input->charPositionInLine	= 0;
    800 			input->currentLine		= (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
    801 		}
    802 
    803 		// Increment to next character position, accounting for any surrogates
    804 		//
    805                 // Next char in natural machine byte order
    806                 //
    807                 ch  = *((UTF16*)input->nextChar);
    808 
    809                 // We consumed one 16 bit character
    810                 //
    811 		input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
    812 
    813                 // If we have a surrogate pair then we need to consume
    814                 // a following valid LO surrogate.
    815                 //
    816                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
    817 
    818                     // If the 16 bits following the high surrogate are in the source buffer...
    819                     //
    820                     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
    821                     {
    822                         // Next character is in natural machine byte order
    823                         //
    824                         ch2 = *((UTF16*)input->nextChar);
    825 
    826                         // If it's a valid low surrogate, consume it
    827                         //
    828                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
    829                         {
    830                             // We consumed one 16 bit character
    831                             //
    832 		            input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
    833                         }
    834                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
    835                         // it.
    836                         //
    837                     }
    838                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
    839                     // it because the buffer ended
    840                     //
    841                 }
    842                 // Note that we did not check for an invalid low surrogate here, or that fact that the
    843                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
    844                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
    845                 //
    846 	}
    847 }
    848 
    849 /// \brief Return the input element assuming an 8 bit ascii input
    850 ///
    851 /// \param[in] input Input stream context pointer
    852 /// \param[in] la 1 based offset of next input stream element
    853 ///
    854 /// \return Next input character in internal ANTLR3 encoding (UTF32)
    855 ///
    856 static ANTLR3_UCHAR
    857 antlr3UTF16LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
    858 {
    859 	pANTLR3_INPUT_STREAM input;
    860         UTF32   ch;
    861         UTF32   ch2;
    862         UTF16   * nextChar;
    863 
    864         // Find the input interface and where we are currently pointing to
    865         // in the input stream
    866         //
    867 	input       = ((pANTLR3_INPUT_STREAM) (is->super));
    868         nextChar    = input->nextChar;
    869 
    870         // If a positive offset then advance forward, else retreat
    871         //
    872         if  (la >= 0)
    873         {
    874             while   (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
    875             {
    876                 // Advance our copy of the input pointer
    877                 //
    878                 // Next char in natural machine byte order
    879                 //
    880                 ch  = *nextChar++;
    881 
    882                 // If we have a surrogate pair then we need to consume
    883                 // a following valid LO surrogate.
    884                 //
    885                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
    886                 {
    887                     // If the 16 bits following the high surrogate are in the source buffer...
    888                     //
    889                     if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
    890                     {
    891                         // Next character is in natural machine byte order
    892                         //
    893                         ch2 = *nextChar;
    894 
    895                         // If it's a valid low surrogate, consume it
    896                         //
    897                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
    898                         {
    899                             // We consumed one 16 bit character
    900                             //
    901 		            nextChar++;
    902                         }
    903                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
    904                         // it.
    905                         //
    906                     }
    907                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
    908                     // it because the buffer ended
    909                     //
    910                 }
    911                 // Note that we did not check for an invalid low surrogate here, or that fact that the
    912                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
    913                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
    914                 //
    915             }
    916         }
    917         else
    918         {
    919             // We need to go backwards from our input point
    920             //
    921             while   (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
    922             {
    923                 // Get the previous 16 bit character
    924                 //
    925                 ch = *--nextChar;
    926 
    927                 // If we found a low surrogate then go back one more character if
    928                 // the hi surrogate is there
    929                 //
    930                 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
    931                 {
    932                     ch2 = *(nextChar-1);
    933                     if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
    934                     {
    935                         // Yes, there is a high surrogate to match it so decrement one more and point to that
    936                         //
    937                         nextChar--;
    938                     }
    939                 }
    940             }
    941         }
    942 
    943         // Our local copy of nextChar is now pointing to either the correct character or end of file
    944         //
    945         // Input buffer size is always in bytes
    946         //
    947 	if	( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
    948 	{
    949 		return	ANTLR3_CHARSTREAM_EOF;
    950 	}
    951 	else
    952 	{
    953             // Pick up the next 16 character (native machine byte order)
    954             //
    955             ch = *nextChar++;
    956 
    957             // If we have a surrogate pair then we need to consume
    958             // a following valid LO surrogate.
    959             //
    960             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
    961             {
    962                 // If the 16 bits following the high surrogate are in the source buffer...
    963                 //
    964                 if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
    965                 {
    966                     // Next character is in natural machine byte order
    967                     //
    968                     ch2 = *nextChar;
    969 
    970                     // If it's a valid low surrogate, consume it
    971                     //
    972                     if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
    973                     {
    974                         // Construct the UTF32 code point
    975                         //
    976                         ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
    977 			    + (ch2 - UNI_SUR_LOW_START) + halfBase;
    978                     }
    979                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
    980                     // it.
    981                     //
    982                 }
    983                 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
    984                 // it because the buffer ended
    985                 //
    986             }
    987         }
    988         return ch;
    989 }
    990 
    991 
    992 /// \brief Calculate the current index in the output stream.
    993 /// \param[in] input Input stream context pointer
    994 ///
    995 static ANTLR3_MARKER
    996 antlr3UTF16Index(pANTLR3_INT_STREAM is)
    997 {
    998     pANTLR3_INPUT_STREAM input;
    999 
   1000     input   = ((pANTLR3_INPUT_STREAM) (is->super));
   1001 
   1002     return  (ANTLR3_MARKER)(input->nextChar);
   1003 }
   1004 
   1005 /// \brief Rewind the lexer input to the state specified by the supplied mark.
   1006 ///
   1007 /// \param[in] input Input stream context pointer
   1008 ///
   1009 /// \remark
   1010 /// Assumes UTF16 input stream.
   1011 ///
   1012 static void
   1013 antlr3UTF16Seek	(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
   1014 {
   1015 	pANTLR3_INPUT_STREAM input;
   1016 
   1017 	input   = ((pANTLR3_INPUT_STREAM) is->super);
   1018 
   1019 	// If the requested seek point is less than the current
   1020 	// input point, then we assume that we are resetting from a mark
   1021 	// and do not need to scan, but can just set to there as rewind will
   1022         // reset line numbers and so on.
   1023 	//
   1024 	if	(seekPoint <= (ANTLR3_MARKER)(input->nextChar))
   1025 	{
   1026 		input->nextChar	= (void *)seekPoint;
   1027 	}
   1028 	else
   1029 	{
   1030             // Call consume until we reach the asked for seek point or EOF
   1031             //
   1032             while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar)
   1033 	    {
   1034 		is->consume(is);
   1035 	    }
   1036 	}
   1037 }
   1038 /// \brief Return a substring of the UTF16 input stream in
   1039 ///  newly allocated memory.
   1040 ///
   1041 /// \param input Input stream context pointer
   1042 /// \param start Offset in input stream where the string starts
   1043 /// \param stop  Offset in the input stream where the string ends.
   1044 ///
   1045 static pANTLR3_STRING
   1046 antlr3UTF16Substr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
   1047 {
   1048     return  input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/2) + 1);
   1049 }
   1050 
   1051 /// \brief Consume the next character in a UTF16 input stream when the input is Little Endian and the machine is not
   1052 /// Note that the UTF16 routines do not do any substantial verification of the input stream as for performance
   1053 /// sake, we assume it is validly encoded. So if a low surrogate is found at the curent input position then we
   1054 /// just consume it. Surrogate pairs should be seen as Hi, Lo. So if we have a Lo first, then the input stream
   1055 /// is fubar but we just ignore that.
   1056 ///
   1057 /// \param input Input stream context pointer
   1058 ///
   1059 static void
   1060 antlr3UTF16ConsumeLE(pANTLR3_INT_STREAM is)
   1061 {
   1062 	pANTLR3_INPUT_STREAM input;
   1063         UTF32   ch;
   1064         UTF32   ch2;
   1065 
   1066 	input   = ((pANTLR3_INPUT_STREAM) (is->super));
   1067 
   1068         // Buffer size is always in bytes
   1069         //
   1070 	if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1071 	{
   1072 		// Indicate one more character in this line
   1073 		//
   1074 		input->charPositionInLine++;
   1075 
   1076 		if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
   1077 		{
   1078 			// Reset for start of a new line of input
   1079 			//
   1080 			input->line++;
   1081 			input->charPositionInLine	= 0;
   1082 			input->currentLine		= (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
   1083 		}
   1084 
   1085 		// Increment to next character position, accounting for any surrogates
   1086 		//
   1087                 // Next char in litle endian form
   1088                 //
   1089                 ch  = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8);
   1090 
   1091                 // We consumed one 16 bit character
   1092                 //
   1093 		input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
   1094 
   1095                 // If we have a surrogate pair then we need to consume
   1096                 // a following valid LO surrogate.
   1097                 //
   1098                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
   1099 
   1100                     // If the 16 bits following the high surrogate are in the source buffer...
   1101                     //
   1102                     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1103                     {
   1104                         ch2 = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8);
   1105 
   1106                         // If it's a valid low surrogate, consume it
   1107                         //
   1108                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
   1109                         {
   1110                             // We consumed one 16 bit character
   1111                             //
   1112 		            input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
   1113                         }
   1114                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
   1115                         // it.
   1116                         //
   1117                     }
   1118                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
   1119                     // it because the buffer ended
   1120                     //
   1121                 }
   1122                 // Note that we did not check for an invalid low surrogate here, or that fact that the
   1123                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
   1124                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
   1125                 //
   1126 	}
   1127 }
   1128 
   1129 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
   1130 ///
   1131 /// \param[in] input Input stream context pointer
   1132 /// \param[in] la 1 based offset of next input stream element
   1133 ///
   1134 /// \return Next input character in internal ANTLR3 encoding (UTF32)
   1135 ///
   1136 static ANTLR3_UCHAR
   1137 antlr3UTF16LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
   1138 {
   1139 	pANTLR3_INPUT_STREAM input;
   1140         UTF32           ch;
   1141         UTF32           ch2;
   1142         pANTLR3_UCHAR   nextChar;
   1143 
   1144         // Find the input interface and where we are currently pointing to
   1145         // in the input stream
   1146         //
   1147 	input       = ((pANTLR3_INPUT_STREAM) (is->super));
   1148         nextChar    = input->nextChar;
   1149 
   1150         // If a positive offset then advance forward, else retreat
   1151         //
   1152         if  (la >= 0)
   1153         {
   1154             while   (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
   1155             {
   1156                 // Advance our copy of the input pointer
   1157                 //
   1158                 // Next char in Little Endian byte order
   1159                 //
   1160                 ch  = (*nextChar) + (*(nextChar+1) << 8);
   1161                 nextChar += 2;
   1162 
   1163                 // If we have a surrogate pair then we need to consume
   1164                 // a following valid LO surrogate.
   1165                 //
   1166                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
   1167                 {
   1168                     // If the 16 bits following the high surrogate are in the source buffer...
   1169                     //
   1170                     if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1171                     {
   1172                         // Next character is in little endian byte order
   1173                         //
   1174                         ch2 = (*nextChar) + (*(nextChar+1) << 8);
   1175 
   1176                         // If it's a valid low surrogate, consume it
   1177                         //
   1178                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
   1179                         {
   1180                             // We consumed one 16 bit character
   1181                             //
   1182 		            nextChar += 2;
   1183                         }
   1184                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
   1185                         // it.
   1186                         //
   1187                     }
   1188                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
   1189                     // it because the buffer ended
   1190                     //
   1191                 }
   1192                 // Note that we did not check for an invalid low surrogate here, or that fact that the
   1193                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
   1194                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
   1195                 //
   1196             }
   1197         }
   1198         else
   1199         {
   1200             // We need to go backwards from our input point
   1201             //
   1202             while   (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
   1203             {
   1204                 // Get the previous 16 bit character
   1205                 //
   1206                 ch = (*nextChar - 2) + ((*nextChar -1) << 8);
   1207                 nextChar -= 2;
   1208 
   1209                 // If we found a low surrogate then go back one more character if
   1210                 // the hi surrogate is there
   1211                 //
   1212                 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
   1213                 {
   1214                     ch2 = (*nextChar - 2) + ((*nextChar -1) << 8);
   1215                     if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
   1216                     {
   1217                         // Yes, there is a high surrogate to match it so decrement one more and point to that
   1218                         //
   1219                         nextChar -=2;
   1220                     }
   1221                 }
   1222             }
   1223         }
   1224 
   1225         // Our local copy of nextChar is now pointing to either the correct character or end of file
   1226         //
   1227         // Input buffer size is always in bytes
   1228         //
   1229 	if	( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1230 	{
   1231 		return	ANTLR3_CHARSTREAM_EOF;
   1232 	}
   1233 	else
   1234 	{
   1235             // Pick up the next 16 character (little endian byte order)
   1236             //
   1237             ch = (*nextChar) + (*(nextChar+1) << 8);
   1238             nextChar += 2;
   1239 
   1240             // If we have a surrogate pair then we need to consume
   1241             // a following valid LO surrogate.
   1242             //
   1243             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
   1244             {
   1245                 // If the 16 bits following the high surrogate are in the source buffer...
   1246                 //
   1247                 if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1248                 {
   1249                     // Next character is in little endian byte order
   1250                     //
   1251                     ch2 = (*nextChar) + (*(nextChar+1) << 8);
   1252 
   1253                     // If it's a valid low surrogate, consume it
   1254                     //
   1255                     if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
   1256                     {
   1257                         // Construct the UTF32 code point
   1258                         //
   1259                         ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
   1260 			    + (ch2 - UNI_SUR_LOW_START) + halfBase;
   1261                     }
   1262                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
   1263                     // it.
   1264                     //
   1265                 }
   1266                 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
   1267                 // it because the buffer ended
   1268                 //
   1269             }
   1270         }
   1271         return ch;
   1272 }
   1273 
   1274 /// \brief Consume the next character in a UTF16 input stream when the input is Big Endian and the machine is not
   1275 ///
   1276 /// \param input Input stream context pointer
   1277 ///
   1278 static void
   1279 antlr3UTF16ConsumeBE(pANTLR3_INT_STREAM is)
   1280 {
   1281 	pANTLR3_INPUT_STREAM input;
   1282         UTF32   ch;
   1283         UTF32   ch2;
   1284 
   1285 	input   = ((pANTLR3_INPUT_STREAM) (is->super));
   1286 
   1287         // Buffer size is always in bytes
   1288         //
   1289 	if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1290 	{
   1291 		// Indicate one more character in this line
   1292 		//
   1293 		input->charPositionInLine++;
   1294 
   1295 		if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
   1296 		{
   1297 			// Reset for start of a new line of input
   1298 			//
   1299 			input->line++;
   1300 			input->charPositionInLine	= 0;
   1301 			input->currentLine		= (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
   1302 		}
   1303 
   1304 		// Increment to next character position, accounting for any surrogates
   1305 		//
   1306                 // Next char in big endian form
   1307                 //
   1308                 ch  = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8);
   1309 
   1310                 // We consumed one 16 bit character
   1311                 //
   1312 		input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
   1313 
   1314                 // If we have a surrogate pair then we need to consume
   1315                 // a following valid LO surrogate.
   1316                 //
   1317                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
   1318 
   1319                     // If the 16 bits following the high surrogate are in the source buffer...
   1320                     //
   1321                     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1322                     {
   1323                         // Big endian
   1324                         //
   1325                         ch2 = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8);
   1326 
   1327                         // If it's a valid low surrogate, consume it
   1328                         //
   1329                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
   1330                         {
   1331                             // We consumed one 16 bit character
   1332                             //
   1333 		            input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
   1334                         }
   1335                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
   1336                         // it.
   1337                         //
   1338                     }
   1339                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
   1340                     // it because the buffer ended
   1341                     //
   1342                 }
   1343                 // Note that we did not check for an invalid low surrogate here, or that fact that the
   1344                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
   1345                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
   1346                 //
   1347 	}
   1348 }
   1349 
   1350 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
   1351 ///
   1352 /// \param[in] input Input stream context pointer
   1353 /// \param[in] la 1 based offset of next input stream element
   1354 ///
   1355 /// \return Next input character in internal ANTLR3 encoding (UTF32)
   1356 ///
   1357 static ANTLR3_UCHAR
   1358 antlr3UTF16LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
   1359 {
   1360 	pANTLR3_INPUT_STREAM input;
   1361         UTF32           ch;
   1362         UTF32           ch2;
   1363         pANTLR3_UCHAR   nextChar;
   1364 
   1365         // Find the input interface and where we are currently pointing to
   1366         // in the input stream
   1367         //
   1368 	input       = ((pANTLR3_INPUT_STREAM) (is->super));
   1369         nextChar    = input->nextChar;
   1370 
   1371         // If a positive offset then advance forward, else retreat
   1372         //
   1373         if  (la >= 0)
   1374         {
   1375             while   (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
   1376             {
   1377                 // Advance our copy of the input pointer
   1378                 //
   1379                 // Next char in Big Endian byte order
   1380                 //
   1381                 ch  = ((*nextChar) << 8) + *(nextChar+1);
   1382                 nextChar += 2;
   1383 
   1384                 // If we have a surrogate pair then we need to consume
   1385                 // a following valid LO surrogate.
   1386                 //
   1387                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
   1388                 {
   1389                     // If the 16 bits following the high surrogate are in the source buffer...
   1390                     //
   1391                     if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1392                     {
   1393                         // Next character is in big endian byte order
   1394                         //
   1395                         ch2 = ((*nextChar) << 8) + *(nextChar+1);
   1396 
   1397                         // If it's a valid low surrogate, consume it
   1398                         //
   1399                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
   1400                         {
   1401                             // We consumed one 16 bit character
   1402                             //
   1403 		            nextChar += 2;
   1404                         }
   1405                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
   1406                         // it.
   1407                         //
   1408                     }
   1409                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
   1410                     // it because the buffer ended
   1411                     //
   1412                 }
   1413                 // Note that we did not check for an invalid low surrogate here, or that fact that the
   1414                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
   1415                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
   1416                 //
   1417             }
   1418         }
   1419         else
   1420         {
   1421             // We need to go backwards from our input point
   1422             //
   1423             while   (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
   1424             {
   1425                 // Get the previous 16 bit character
   1426                 //
   1427                 ch = ((*nextChar - 2) << 8) + (*nextChar -1);
   1428                 nextChar -= 2;
   1429 
   1430                 // If we found a low surrogate then go back one more character if
   1431                 // the hi surrogate is there
   1432                 //
   1433                 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
   1434                 {
   1435                     ch2 = ((*nextChar - 2) << 8) + (*nextChar -1);
   1436                     if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
   1437                     {
   1438                         // Yes, there is a high surrogate to match it so decrement one more and point to that
   1439                         //
   1440                         nextChar -=2;
   1441                     }
   1442                 }
   1443             }
   1444         }
   1445 
   1446         // Our local copy of nextChar is now pointing to either the correct character or end of file
   1447         //
   1448         // Input buffer size is always in bytes
   1449         //
   1450 	if	( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1451 	{
   1452 		return	ANTLR3_CHARSTREAM_EOF;
   1453 	}
   1454 	else
   1455 	{
   1456             // Pick up the next 16 character (big endian byte order)
   1457             //
   1458             ch = ((*nextChar) << 8) + *(nextChar+1);
   1459             nextChar += 2;
   1460 
   1461             // If we have a surrogate pair then we need to consume
   1462             // a following valid LO surrogate.
   1463             //
   1464             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
   1465             {
   1466                 // If the 16 bits following the high surrogate are in the source buffer...
   1467                 //
   1468                 if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1469                 {
   1470                     // Next character is in big endian byte order
   1471                     //
   1472                     ch2 = ((*nextChar) << 8) + *(nextChar+1);
   1473 
   1474                     // If it's a valid low surrogate, consume it
   1475                     //
   1476                     if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
   1477                     {
   1478                         // Construct the UTF32 code point
   1479                         //
   1480                         ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
   1481 			    + (ch2 - UNI_SUR_LOW_START) + halfBase;
   1482                     }
   1483                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
   1484                     // it.
   1485                     //
   1486                 }
   1487                 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
   1488                 // it because the buffer ended
   1489                 //
   1490             }
   1491         }
   1492         return ch;
   1493 }
   1494 
   1495 /// \brief Common function to setup function interface for a UTF3 input stream.
   1496 ///
   1497 /// \param input Input stream context pointer
   1498 ///
   1499 void
   1500 antlr3UTF32SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian)
   1501 {
   1502     // Build a string factory for this stream. This is a UTF32 string factory which is a standard
   1503     // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
   1504     // and so on.
   1505     //
   1506     input->strFactory	= antlr3StringFactoryNew(input->encoding);
   1507 
   1508     // Generic API that does not care about endianess.
   1509     //
   1510     input->istream->index	    =  antlr3UTF32Index;            // Calculate current index in input stream, UTF16 based
   1511     input->substr		    =  antlr3UTF32Substr;	    // Return a string from the input stream
   1512     input->istream->seek	    =  antlr3UTF32Seek;		    // How to seek to a specific point in the stream
   1513     input->istream->consume	    =  antlr3UTF32Consume;	    // Consume the next UTF32 character in the buffer
   1514 
   1515     // We must install different UTF32 LA routines according to whether the input
   1516     // is the same endianess as the machine we are executing upon or not. If it is not
   1517     // then we must install methods that can convert the endianess on the fly as they go
   1518     //
   1519     switch (machineBigEndian)
   1520     {
   1521         case    ANTLR3_TRUE:
   1522 
   1523             // Machine is Big Endian, if the input is also then install the
   1524             // methods that do not access input by bytes and reverse them.
   1525             // Otherwise install endian aware methods.
   1526             //
   1527             if  (inputBigEndian == ANTLR3_TRUE)
   1528             {
   1529                 // Input is machine compatible
   1530                 //
   1531                 input->istream->_LA         =  antlr3UTF32LA;		    // Return the UTF32 character at offset n (1 based)
   1532             }
   1533             else
   1534             {
   1535                 // Need to use methods that know that the input is little endian
   1536                 //
   1537                 input->istream->_LA         =  antlr3UTF32LALE;		    // Return the UTF32 character at offset n (1 based)
   1538             }
   1539             break;
   1540 
   1541         case    ANTLR3_FALSE:
   1542 
   1543             // Machine is Little Endian, if the input is also then install the
   1544             // methods that do not access input by bytes and reverse them.
   1545             // Otherwise install endian aware methods.
   1546             //
   1547             if  (inputBigEndian == ANTLR3_FALSE)
   1548             {
   1549                 // Input is machine compatible
   1550                 //
   1551                 input->istream->_LA         =  antlr3UTF32LA;		    // Return the UTF32 character at offset n (1 based)
   1552             }
   1553             else
   1554             {
   1555                 // Need to use methods that know that the input is Big Endian
   1556                 //
   1557                 input->istream->_LA         =  antlr3UTF32LABE;		    // Return the UTF32 character at offset n (1 based)
   1558             }
   1559             break;
   1560     }
   1561 
   1562     input->charByteSize		    = 4;			    // Size in bytes of characters in this stream.
   1563 }
   1564 
   1565 /** \brief Consume the next character in a UTF32 input stream
   1566  *
   1567  * \param input Input stream context pointer
   1568  */
   1569 static void
   1570 antlr3UTF32Consume(pANTLR3_INT_STREAM is)
   1571 {
   1572     pANTLR3_INPUT_STREAM input;
   1573 
   1574     input   = ((pANTLR3_INPUT_STREAM) (is->super));
   1575 
   1576     // SizeBuf is always in bytes
   1577     //
   1578     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1579     {
   1580 	/* Indicate one more character in this line
   1581 	 */
   1582 	input->charPositionInLine++;
   1583 
   1584 	if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar)) == input->newlineChar)
   1585 	{
   1586 	    /* Reset for start of a new line of input
   1587 	     */
   1588 	    input->line++;
   1589 	    input->charPositionInLine	= 0;
   1590 	    input->currentLine		= (void *)(((pANTLR3_UINT32)input->nextChar) + 1);
   1591 	}
   1592 
   1593 	/* Increment to next character position
   1594 	 */
   1595 	input->nextChar = (void *)(((pANTLR3_UINT32)input->nextChar) + 1);
   1596     }
   1597 }
   1598 
   1599 /// \brief Calculate the current index in the output stream.
   1600 /// \param[in] input Input stream context pointer
   1601 ///
   1602 static ANTLR3_MARKER
   1603 antlr3UTF32Index(pANTLR3_INT_STREAM is)
   1604 {
   1605     pANTLR3_INPUT_STREAM input;
   1606 
   1607     input   = ((pANTLR3_INPUT_STREAM) (is->super));
   1608 
   1609     return  (ANTLR3_MARKER)(input->nextChar);
   1610 }
   1611 
   1612 /// \brief Return a substring of the UTF16 input stream in
   1613 ///  newly allocated memory.
   1614 ///
   1615 /// \param input Input stream context pointer
   1616 /// \param start Offset in input stream where the string starts
   1617 /// \param stop  Offset in the input stream where the string ends.
   1618 ///
   1619 static pANTLR3_STRING
   1620 antlr3UTF32Substr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
   1621 {
   1622     return  input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/4) + 1);
   1623 }
   1624 
   1625 /// \brief Rewind the lexer input to the state specified by the supplied mark.
   1626 ///
   1627 /// \param[in] input Input stream context pointer
   1628 ///
   1629 /// \remark
   1630 /// Assumes UTF32 input stream.
   1631 ///
   1632 static void
   1633 antlr3UTF32Seek	(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
   1634 {
   1635 	pANTLR3_INPUT_STREAM input;
   1636 
   1637 	input   = ((pANTLR3_INPUT_STREAM) is->super);
   1638 
   1639 	// If the requested seek point is less than the current
   1640 	// input point, then we assume that we are resetting from a mark
   1641 	// and do not need to scan, but can just set to there as rewind will
   1642         // reset line numbers and so on.
   1643 	//
   1644 	if	(seekPoint <= (ANTLR3_MARKER)(input->nextChar))
   1645 	{
   1646 		input->nextChar	= (void *)seekPoint;
   1647 	}
   1648 	else
   1649 	{
   1650             // Call consume until we reach the asked for seek point or EOF
   1651             //
   1652             while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar)
   1653 	    {
   1654 		is->consume(is);
   1655 	    }
   1656 	}
   1657 }
   1658 
   1659 /** \brief Return the input element assuming a UTF32 input in natural machine byte order
   1660  *
   1661  * \param[in] input Input stream context pointer
   1662  * \param[in] la 1 based offset of next input stream element
   1663  *
   1664  * \return Next input character in internal ANTLR3 encoding (UTF32)
   1665  */
   1666 static ANTLR3_UCHAR
   1667 antlr3UTF32LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
   1668 {
   1669     pANTLR3_INPUT_STREAM input;
   1670 
   1671     input   = ((pANTLR3_INPUT_STREAM) (is->super));
   1672 
   1673     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1674     {
   1675 		return	ANTLR3_CHARSTREAM_EOF;
   1676     }
   1677     else
   1678     {
   1679 		return	(ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
   1680     }
   1681 }
   1682 
   1683 /** \brief Return the input element assuming a UTF32 input in little endian byte order
   1684  *
   1685  * \param[in] input Input stream context pointer
   1686  * \param[in] la 1 based offset of next input stream element
   1687  *
   1688  * \return Next input character in internal ANTLR3 encoding (UTF32)
   1689  */
   1690 static ANTLR3_UCHAR
   1691 antlr3UTF32LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
   1692 {
   1693     pANTLR3_INPUT_STREAM input;
   1694 
   1695     input   = ((pANTLR3_INPUT_STREAM) (is->super));
   1696 
   1697     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1698     {
   1699 		return	ANTLR3_CHARSTREAM_EOF;
   1700     }
   1701     else
   1702     {
   1703         ANTLR3_UCHAR   c;
   1704 
   1705         c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
   1706 
   1707         // Swap Endianess to Big Endian
   1708         //
   1709         return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24);
   1710     }
   1711 }
   1712 
   1713 /** \brief Return the input element assuming a UTF32 input in big endian byte order
   1714  *
   1715  * \param[in] input Input stream context pointer
   1716  * \param[in] la 1 based offset of next input stream element
   1717  *
   1718  * \return Next input character in internal ANTLR3 encoding (UTF32)
   1719  * \remark This is the same code as LE version but seprated in case there are better optimisations fo rendinan swap
   1720  */
   1721 static ANTLR3_UCHAR
   1722 antlr3UTF32LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
   1723 {
   1724     pANTLR3_INPUT_STREAM input;
   1725 
   1726     input   = ((pANTLR3_INPUT_STREAM) (is->super));
   1727 
   1728     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1729     {
   1730 		return	ANTLR3_CHARSTREAM_EOF;
   1731     }
   1732     else
   1733     {
   1734         ANTLR3_UCHAR   c;
   1735 
   1736         c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
   1737 
   1738         // Swap Endianess to Little Endian
   1739         //
   1740         return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24);
   1741     }
   1742 }
   1743 
   1744 
   1745 /// \brief Common function to setup function interface for a UTF8 input stream.
   1746 ///
   1747 /// \param input Input stream context pointer
   1748 ///
   1749 void
   1750 antlr3UTF8SetupStream	(pANTLR3_INPUT_STREAM input)
   1751 {
   1752     // Build a string factory for this stream. This is a UTF16 string factory which is a standard
   1753     // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
   1754     // and so on.
   1755     //
   1756     input->strFactory	= antlr3StringFactoryNew(input->encoding);
   1757 
   1758     // Generic API that does not care about endianess.
   1759     //
   1760     input->istream->consume	= antlr3UTF8Consume;	// Consume the next UTF32 character in the buffer
   1761     input->istream->_LA         = antlr3UTF8LA;         // Return the UTF32 character at offset n (1 based)
   1762     input->charByteSize		= 0;	                // Size in bytes of characters in this stream.
   1763 }
   1764 
   1765 // ------------------------------------------------------
   1766 // Following is from Unicode.org (see antlr3convertutf.c)
   1767 //
   1768 
   1769 /// Index into the table below with the first byte of a UTF-8 sequence to
   1770 /// get the number of trailing bytes that are supposed to follow it.
   1771 /// Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
   1772 /// left as-is for anyone who may want to do such conversion, which was
   1773 /// allowed in earlier algorithms.
   1774 ///
   1775 static const ANTLR3_UINT32 trailingBytesForUTF8[256] = {
   1776     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1777     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1778     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1779     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1780     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1781     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1782     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1783     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
   1784 };
   1785 
   1786 /// Magic values subtracted from a buffer value during UTF8 conversion.
   1787 /// This table contains as many values as there might be trailing bytes
   1788 /// in a UTF-8 sequence.
   1789 ///
   1790 static const UTF32 offsetsFromUTF8[6] =
   1791     {   0x00000000UL, 0x00003080UL, 0x000E2080UL,
   1792 	0x03C82080UL, 0xFA082080UL, 0x82082080UL
   1793     };
   1794 
   1795 // End of Unicode.org tables
   1796 // -------------------------
   1797 
   1798 
   1799 /** \brief Consume the next character in a UTF8 input stream
   1800  *
   1801  * \param input Input stream context pointer
   1802  */
   1803 static void
   1804 antlr3UTF8Consume(pANTLR3_INT_STREAM is)
   1805 {
   1806     pANTLR3_INPUT_STREAM    input;
   1807     ANTLR3_UINT32           extraBytesToRead;
   1808     ANTLR3_UCHAR            ch;
   1809     pANTLR3_UINT8           nextChar;
   1810 
   1811     input   = ((pANTLR3_INPUT_STREAM) (is->super));
   1812 
   1813     nextChar = input->nextChar;
   1814 
   1815     if	(nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1816     {
   1817 	// Indicate one more character in this line
   1818 	//
   1819 	input->charPositionInLine++;
   1820 
   1821         // Are there more bytes needed to make up the whole thing?
   1822         //
   1823         extraBytesToRead = trailingBytesForUTF8[*nextChar];
   1824 
   1825         if	(nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1826         {
   1827             input->nextChar = (((pANTLR3_UINT8)input->data) + input->sizeBuf);
   1828             return;
   1829         }
   1830 
   1831         // Cases deliberately fall through (see note A in antlrconvertutf.c)
   1832         // Legal UTF8 is only 4 bytes but 6 bytes could be used in old UTF8 so
   1833         // we allow it.
   1834         //
   1835         ch  = 0;
   1836        	switch (extraBytesToRead) {
   1837 	    case 5: ch += *nextChar++; ch <<= 6;
   1838 	    case 4: ch += *nextChar++; ch <<= 6;
   1839 	    case 3: ch += *nextChar++; ch <<= 6;
   1840 	    case 2: ch += *nextChar++; ch <<= 6;
   1841 	    case 1: ch += *nextChar++; ch <<= 6;
   1842 	    case 0: ch += *nextChar++;
   1843 	}
   1844 
   1845         // Magically correct the input value
   1846         //
   1847 	ch -= offsetsFromUTF8[extraBytesToRead];
   1848 	if  (ch == input->newlineChar)
   1849 	{
   1850 	    /* Reset for start of a new line of input
   1851 	     */
   1852 	    input->line++;
   1853 	    input->charPositionInLine	= 0;
   1854 	    input->currentLine		= (void *)nextChar;
   1855 	}
   1856 
   1857         // Update input pointer
   1858         //
   1859         input->nextChar = nextChar;
   1860     }
   1861 }
   1862 /** \brief Return the input element assuming a UTF8 input
   1863  *
   1864  * \param[in] input Input stream context pointer
   1865  * \param[in] la 1 based offset of next input stream element
   1866  *
   1867  * \return Next input character in internal ANTLR3 encoding (UTF32)
   1868  */
   1869 static ANTLR3_UCHAR
   1870 antlr3UTF8LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
   1871 {
   1872     pANTLR3_INPUT_STREAM    input;
   1873     ANTLR3_UINT32           extraBytesToRead;
   1874     ANTLR3_UCHAR            ch;
   1875     pANTLR3_UINT8           nextChar;
   1876 
   1877     input   = ((pANTLR3_INPUT_STREAM) (is->super));
   1878 
   1879     nextChar = input->nextChar;
   1880 
   1881     // Do we need to traverse forwards or backwards?
   1882     // - LA(0) is treated as LA(1) and we assume that the nextChar is
   1883     //   already positioned.
   1884     // - LA(n+) ; n>1 means we must traverse forward n-1 characters catering for UTF8 encoding
   1885     // - LA(-n) means we must traverse backwards n chracters
   1886     //
   1887     if (la > 1) {
   1888 
   1889         // Make sure that we have at least one character left before trying to
   1890         // loop through the buffer.
   1891         //
   1892         if	(nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1893         {
   1894             // Now traverse n-1 characters forward
   1895             //
   1896             while (--la > 0)
   1897             {
   1898                 // Does the next character require trailing bytes?
   1899                 // If so advance the pointer by that many bytes as well as advancing
   1900                 // one position for what will be at least a single byte character.
   1901                 //
   1902                 nextChar += trailingBytesForUTF8[*nextChar] + 1;
   1903 
   1904                 // Does that calculation take us past the byte length of the buffer?
   1905                 //
   1906                 if	(nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1907                 {
   1908                     return ANTLR3_CHARSTREAM_EOF;
   1909                 }
   1910             }
   1911         }
   1912         else
   1913         {
   1914             return ANTLR3_CHARSTREAM_EOF;
   1915         }
   1916     }
   1917     else
   1918     {
   1919         // LA is negative so we decrease the pointer by n character positions
   1920         //
   1921         while   (nextChar > (pANTLR3_UINT8)input->data && la++ < 0)
   1922         {
   1923             // Traversing backwards in UTF8 means decermenting by one
   1924             // then continuing to decrement while ever a character pattern
   1925             // is flagged as being a trailing byte of an encoded code point.
   1926             // Trailing UTF8 bytes always start with 10 in binary. We assumne that
   1927             // the UTF8 is well formed and do not check boundary conditions
   1928             //
   1929             nextChar--;
   1930             while ((*nextChar & 0xC0) == 0x80)
   1931             {
   1932                 nextChar--;
   1933             }
   1934         }
   1935     }
   1936 
   1937     // nextChar is now pointing at the UTF8 encoded character that we need to
   1938     // decode and return.
   1939     //
   1940     // Are there more bytes needed to make up the whole thing?
   1941     //
   1942     extraBytesToRead = trailingBytesForUTF8[*nextChar];
   1943     if	(nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   1944     {
   1945         return ANTLR3_CHARSTREAM_EOF;
   1946     }
   1947 
   1948     // Cases deliberately fall through (see note A in antlrconvertutf.c)
   1949     //
   1950     ch  = 0;
   1951     switch (extraBytesToRead) {
   1952             case 5: ch += *nextChar++; ch <<= 6;
   1953             case 4: ch += *nextChar++; ch <<= 6;
   1954             case 3: ch += *nextChar++; ch <<= 6;
   1955             case 2: ch += *nextChar++; ch <<= 6;
   1956             case 1: ch += *nextChar++; ch <<= 6;
   1957             case 0: ch += *nextChar++;
   1958     }
   1959 
   1960     // Magically correct the input value
   1961     //
   1962     ch -= offsetsFromUTF8[extraBytesToRead];
   1963 
   1964     return ch;
   1965 }
   1966 
   1967 // EBCDIC to ASCII conversion table
   1968 //
   1969 // This for EBCDIC EDF04 translated to ISO-8859.1 which is the usually accepted POSIX
   1970 // translation and the character tables are published all over the interweb.
   1971 //
   1972 const ANTLR3_UCHAR e2a[256] =
   1973 {
   1974     0x00, 0x01, 0x02, 0x03, 0x85, 0x09, 0x86, 0x7f,
   1975     0x87, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
   1976     0x10, 0x11, 0x12, 0x13, 0x8f, 0x0a, 0x08, 0x97,
   1977     0x18, 0x19, 0x9c, 0x9d, 0x1c, 0x1d, 0x1e, 0x1f,
   1978     0x80, 0x81, 0x82, 0x83, 0x84, 0x92, 0x17, 0x1b,
   1979     0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07,
   1980     0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,
   1981     0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a,
   1982     0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5,
   1983     0xe7, 0xf1, 0x60, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
   1984     0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef,
   1985     0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x9f,
   1986     0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5,
   1987     0xc7, 0xd1, 0x5e, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
   1988     0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf,
   1989     0xcc, 0xa8, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
   1990     0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
   1991     0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1,
   1992     0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70,
   1993     0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4,
   1994     0xb5, 0xaf, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
   1995     0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xde, 0xae,
   1996     0xa2, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc,
   1997     0xbd, 0xbe, 0xac, 0x5b, 0x5c, 0x5d, 0xb4, 0xd7,
   1998     0xf9, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
   1999     0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5,
   2000     0xa6, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
   2001     0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xdb, 0xfa, 0xff,
   2002     0xd9, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
   2003     0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5,
   2004     0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
   2005     0x38, 0x39, 0xb3, 0x7b, 0xdc, 0x7d, 0xda, 0x7e
   2006 };
   2007 
   2008 /// \brief Common function to setup function interface for a EBCDIC input stream.
   2009 ///
   2010 /// \param input Input stream context pointer
   2011 ///
   2012 void
   2013 antlr3EBCDICSetupStream	(pANTLR3_INPUT_STREAM input)
   2014 {
   2015     // EBCDIC streams can use the standard 8 bit string factory
   2016     //
   2017     input->strFactory	= antlr3StringFactoryNew(input->encoding);
   2018 
   2019     // Generic API that does not care about endianess.
   2020     //
   2021     input->istream->_LA         = antlr3EBCDICLA;       // Return the UTF32 character at offset n (1 based)
   2022     input->charByteSize		= 1;	                // Size in bytes of characters in this stream.
   2023 }
   2024 
   2025 /// \brief Return the input element assuming an 8 bit EBCDIC input
   2026 ///
   2027 /// \param[in] input Input stream context pointer
   2028 /// \param[in] la 1 based offset of next input stream element
   2029 ///
   2030 /// \return Next input character in internal ANTLR3 encoding (UTF32) after translation
   2031 ///         from EBCDIC to ASCII
   2032 ///
   2033 static ANTLR3_UCHAR
   2034 antlr3EBCDICLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
   2035 {
   2036     pANTLR3_INPUT_STREAM input;
   2037 
   2038     input   = ((pANTLR3_INPUT_STREAM) (is->super));
   2039 
   2040     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
   2041     {
   2042         return	ANTLR3_CHARSTREAM_EOF;
   2043     }
   2044     else
   2045     {
   2046         // Translate the required character via the constant conversion table
   2047         //
   2048         return	e2a[(*((pANTLR3_UINT8)input->nextChar + la - 1))];
   2049     }
   2050 }