Home | History | Annotate | Download | only in src
      1 /** \file
      2  * \brief The ANTLR3 C filestream is used when the source character stream
      3  * is a filesystem based input set and all the characters in the filestream
      4  * can be loaded at once into memory and away the lexer goes.
      5  *
      6  * A number of initializers are provided in order that various character
      7  * sets can be supported from input files. The ANTLR3 C runtime expects
      8  * to deal with UTF32 characters only (the reasons for this are to
      9  * do with the simplification of C code when using this form of Unicode
     10  * encoding, though this is not a panacea). More information can be
     11  * found on this by consulting:
     12  *   - http://www.unicode.org/versions/Unicode4.0.0/ch02.pdf#G11178
     13  * Where a well grounded discussion of the encoding formats available
     14  * may be found.
     15  *
     16  */
     17 
     18 // [The "BSD licence"]
     19 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
     20 // http://www.temporal-wave.com
     21 // http://www.linkedin.com/in/jimidle
     22 //
     23 // All rights reserved.
     24 //
     25 // Redistribution and use in source and binary forms, with or without
     26 // modification, are permitted provided that the following conditions
     27 // are met:
     28 // 1. Redistributions of source code must retain the above copyright
     29 //    notice, this list of conditions and the following disclaimer.
     30 // 2. Redistributions in binary form must reproduce the above copyright
     31 //    notice, this list of conditions and the following disclaimer in the
     32 //    documentation and/or other materials provided with the distribution.
     33 // 3. The name of the author may not be used to endorse or promote products
     34 //    derived from this software without specific prior written permission.
     35 //
     36 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     37 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     38 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     39 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     40 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     41 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     42 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     43 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     44 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     45 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     46 
     47 #include    <antlr3.h>
     48 
     49 static  void                    setupInputStream            (pANTLR3_INPUT_STREAM input);
     50 static  pANTLR3_INPUT_STREAM    antlr3CreateFileStream      (pANTLR3_UINT8 fileName);
     51 static  pANTLR3_INPUT_STREAM    antlr3CreateStringStream    (pANTLR3_UINT8 data);
     52 
     53 ANTLR3_API pANTLR3_INPUT_STREAM
     54 antlr3FileStreamNew(pANTLR3_UINT8 fileName, ANTLR3_UINT32 encoding)
     55 {
     56     pANTLR3_INPUT_STREAM input;
     57 
     58     // First order of business is to read the file into some buffer space
     59     // as just straight 8 bit bytes. Then we will work out the encoding and
     60     // byte order and adjust the API functions that are installed for the
     61     // default 8Bit stream accordingly.
     62     //
     63     input   = antlr3CreateFileStream(fileName);
     64     if  (input == NULL)
     65     {
     66         return NULL;
     67     }
     68 
     69     // We have the data in memory now so we can deal with it according to
     70     // the encoding scheme we were given by the user.
     71     //
     72     input->encoding = encoding;
     73 
     74     // Now we need to work out the endian type and install any
     75     // API functions that differ from 8Bit
     76     //
     77     setupInputStream(input);
     78 
     79     // Now we can set up the file name
     80     //
     81     input->istream->streamName	= input->strFactory->newStr8(input->strFactory, fileName);
     82     input->fileName		= input->istream->streamName;
     83 
     84     return input;
     85 }
     86 
     87 
     88 ANTLR3_API pANTLR3_INPUT_STREAM
     89 antlr3StringStreamNew(pANTLR3_UINT8 data, ANTLR3_UINT32 encoding, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
     90 {
     91     pANTLR3_INPUT_STREAM    input;
     92 
     93     // First order of business is to set up the stream and install the data pointer.
     94     // Then we will work out the encoding and byte order and adjust the API functions that are installed for the
     95     // default 8Bit stream accordingly.
     96     //
     97     input   = antlr3CreateStringStream(data);
     98     if  (input == NULL)
     99     {
    100         return NULL;
    101     }
    102 
    103     // Size (in bytes) of the given 'string'
    104     //
    105     input->sizeBuf		= size;
    106 
    107     // We have the data in memory now so we can deal with it according to
    108     // the encoding scheme we were given by the user.
    109     //
    110     input->encoding = encoding;
    111 
    112     // Now we need to work out the endian type and install any
    113     // API functions that differ from 8Bit
    114     //
    115     setupInputStream(input);
    116 
    117     // Now we can set up the file name
    118     //
    119     input->istream->streamName	= input->strFactory->newStr8(input->strFactory, name);
    120     input->fileName		= input->istream->streamName;
    121 
    122     return input;
    123 }
    124 
    125 
    126 /// Determine endianess of the input stream and install the
    127 /// API required for the encoding in that format.
    128 ///
    129 static void
    130 setupInputStream(pANTLR3_INPUT_STREAM input)
    131 {
    132     ANTLR3_BOOLEAN  isBigEndian;
    133 
    134     // Used to determine the endianness of the machine we are currently
    135     // running on.
    136     //
    137     ANTLR3_UINT16 bomTest = 0xFEFF;
    138 
    139     // What endianess is the machine we are running on? If the incoming
    140     // encoding endianess is the same as this machine's natural byte order
    141     // then we can use more efficient API calls.
    142     //
    143     if  (*((pANTLR3_UINT8)(&bomTest)) == 0xFE)
    144     {
    145         isBigEndian = ANTLR3_TRUE;
    146     }
    147     else
    148     {
    149         isBigEndian = ANTLR3_FALSE;
    150     }
    151 
    152     // What encoding did the user tell us {s}he thought it was? I am going
    153     // to get sick of the questions on antlr-interest, I know I am.
    154     //
    155     switch  (input->encoding)
    156     {
    157         case    ANTLR3_ENC_UTF8:
    158 
    159             // See if there is a BOM at the start of this UTF-8 sequence
    160             // and just eat it if there is. Windows .TXT files have this for instance
    161             // as it identifies UTF-8 even though it is of no consequence for byte order
    162             // as UTF-8 does not have a byte order.
    163             //
    164             if  (       (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xEF
    165                     &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xBB
    166                     &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2))    == 0xBF
    167                 )
    168             {
    169                 // The UTF8 BOM is present so skip it
    170                 //
    171                 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 3);
    172             }
    173 
    174             // Install the UTF8 input routines
    175             //
    176             antlr3UTF8SetupStream(input);
    177             break;
    178 
    179         case    ANTLR3_ENC_UTF16:
    180 
    181             // See if there is a BOM at the start of the input. If not then
    182             // we assume that the byte order is the natural order of this
    183             // machine (or it is really UCS2). If there is a BOM we determine if the encoding
    184             // is the same as the natural order of this machine.
    185             //
    186             if  (       (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xFE
    187                     &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xFF
    188                 )
    189             {
    190                 // BOM Present, indicates Big Endian
    191                 //
    192                 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);
    193 
    194                 antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
    195             }
    196             else if  (      (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xFF
    197                         &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xFE
    198                 )
    199             {
    200                 // BOM present, indicates Little Endian
    201                 //
    202                 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);
    203 
    204                 antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
    205             }
    206             else
    207             {
    208                 // No BOM present, assume local computer byte order
    209                 //
    210                 antlr3UTF16SetupStream(input, isBigEndian, isBigEndian);
    211             }
    212             break;
    213 
    214         case    ANTLR3_ENC_UTF32:
    215 
    216             // See if there is a BOM at the start of the input. If not then
    217             // we assume that the byte order is the natural order of this
    218             // machine. If there is we determine if the encoding
    219             // is the same as the natural order of this machine.
    220             //
    221             if  (       (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0x00
    222                     &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0x00
    223                     &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2))    == 0xFE
    224                     &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+3))    == 0xFF
    225                 )
    226             {
    227                 // BOM Present, indicates Big Endian
    228                 //
    229                 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);
    230 
    231                 antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
    232             }
    233             else if  (      (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xFF
    234                         &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xFE
    235                         &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0x00
    236                         &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0x00
    237                 )
    238             {
    239                 // BOM present, indicates Little Endian
    240                 //
    241                 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);
    242 
    243                 antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
    244             }
    245             else
    246             {
    247                 // No BOM present, assume local computer byte order
    248                 //
    249                 antlr3UTF32SetupStream(input, isBigEndian, isBigEndian);
    250             }
    251             break;
    252 
    253         case    ANTLR3_ENC_UTF16BE:
    254 
    255             // Encoding is definately Big Endian with no BOM
    256             //
    257             antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
    258             break;
    259 
    260         case    ANTLR3_ENC_UTF16LE:
    261 
    262             // Encoding is definately Little Endian with no BOM
    263             //
    264             antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
    265             break;
    266 
    267         case    ANTLR3_ENC_UTF32BE:
    268 
    269             // Encoding is definately Big Endian with no BOM
    270             //
    271             antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
    272             break;
    273 
    274         case    ANTLR3_ENC_UTF32LE:
    275 
    276             // Encoding is definately Little Endian with no BOM
    277             //
    278             antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
    279             break;
    280 
    281         case    ANTLR3_ENC_EBCDIC:
    282 
    283             // EBCDIC is basically the same as ASCII but with an on the
    284             // fly translation to ASCII
    285             //
    286             antlr3EBCDICSetupStream(input);
    287             break;
    288 
    289         case    ANTLR3_ENC_8BIT:
    290         default:
    291 
    292             // Standard 8bit/ASCII
    293             //
    294             antlr38BitSetupStream(input);
    295             break;
    296     }
    297 }
    298 
    299 /** \brief Use the contents of an operating system file as the input
    300  *         for an input stream.
    301  *
    302  * \param fileName Name of operating system file to read.
    303  * \return
    304  *	- Pointer to new input stream context upon success
    305  *	- One of the ANTLR3_ERR_ defines on error.
    306  */
    307 static pANTLR3_INPUT_STREAM
    308 antlr3CreateFileStream(pANTLR3_UINT8 fileName)
    309 {
    310 	// Pointer to the input stream we are going to create
    311 	//
    312 	pANTLR3_INPUT_STREAM    input;
    313 	ANTLR3_UINT32	    status;
    314 
    315 	if	(fileName == NULL)
    316 	{
    317 		return NULL;
    318 	}
    319 
    320 	// Allocate memory for the input stream structure
    321 	//
    322 	input   = (pANTLR3_INPUT_STREAM)
    323 		ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
    324 
    325 	if	(input == NULL)
    326 	{
    327 		return	NULL;
    328 	}
    329 
    330 	// Structure was allocated correctly, now we can read the file.
    331 	//
    332 	status  = antlr3read8Bit(input, fileName);
    333 
    334 	// Call the common 8 bit input stream handler
    335 	// initialization.
    336 	//
    337 	antlr3GenericSetupStream(input);
    338 
    339         // However if the file was not there or something then we
    340         // need to close. Have to wait until here as we cannot call
    341         // close until the API is installed of course.
    342         //
    343 	if	(status != ANTLR3_SUCCESS)
    344 	{
    345 		input->close(input);
    346 		return	NULL;
    347 	}
    348 
    349 	return  input;
    350 }
    351 
    352 ANTLR3_API ANTLR3_UINT32
    353 antlr3read8Bit(pANTLR3_INPUT_STREAM    input, pANTLR3_UINT8 fileName)
    354 {
    355 	ANTLR3_FDSC	    infile;
    356 	ANTLR3_UINT32	    fSize;
    357 
    358 	/* Open the OS file in read binary mode
    359 	*/
    360 	infile  = antlr3Fopen(fileName, "rb");
    361 
    362 	/* Check that it was there
    363 	*/
    364 	if	(infile == NULL)
    365 	{
    366 		return	(ANTLR3_UINT32)ANTLR3_ERR_NOFILE;
    367 	}
    368 
    369 	/* It was there, so we can read the bytes now
    370 	*/
    371 	fSize   = antlr3Fsize(fileName);	/* Size of input file	*/
    372 
    373 	/* Allocate buffer for this input set
    374 	*/
    375 	input->data	    = ANTLR3_MALLOC((size_t)fSize);
    376 	input->sizeBuf  = fSize;
    377 
    378 	if	(input->data == NULL)
    379 	{
    380 		return	(ANTLR3_UINT32)ANTLR3_ERR_NOMEM;
    381 	}
    382 
    383 	input->isAllocated	= ANTLR3_TRUE;
    384 
    385 	/* Now we read the file. Characters are not converted to
    386 	* the internal ANTLR encoding until they are read from the buffer
    387 	*/
    388 	antlr3Fread(infile, fSize, input->data);
    389 
    390 	/* And close the file handle
    391 	*/
    392 	antlr3Fclose(infile);
    393 
    394 	return  ANTLR3_SUCCESS;
    395 }
    396 
    397 /** \brief Open an operating system file and return the descriptor
    398  * We just use the common open() and related functions here.
    399  * Later we might find better ways on systems
    400  * such as Windows and OpenVMS for instance. But the idea is to read the
    401  * while file at once anyway, so it may be irrelevant.
    402  */
    403 ANTLR3_API ANTLR3_FDSC
    404 antlr3Fopen(pANTLR3_UINT8 filename, const char * mode)
    405 {
    406     return  (ANTLR3_FDSC)fopen((const char *)filename, mode);
    407 }
    408 
    409 /** \brief Close an operating system file and free any handles
    410  *  etc.
    411  */
    412 ANTLR3_API void
    413 antlr3Fclose(ANTLR3_FDSC fd)
    414 {
    415     fclose(fd);
    416 }
    417 ANTLR3_API ANTLR3_UINT32
    418 antlr3Fsize(pANTLR3_UINT8 fileName)
    419 {
    420     struct _stat	statbuf;
    421 
    422     _stat((const char *)fileName, &statbuf);
    423 
    424     return (ANTLR3_UINT32)statbuf.st_size;
    425 }
    426 
    427 ANTLR3_API ANTLR3_UINT32
    428 antlr3Fread(ANTLR3_FDSC fdsc, ANTLR3_UINT32 count,  void * data)
    429 {
    430     return  (ANTLR3_UINT32)fread(data, (size_t)count, 1, fdsc);
    431 }
    432 
    433 
    434 /** \brief Use the supplied 'string' as input to the stream
    435  *
    436  * \param data Pointer to the input data
    437  * \return
    438  *	- Pointer to new input stream context upon success
    439  *	- NULL defines on error.
    440  */
    441 static pANTLR3_INPUT_STREAM
    442 antlr3CreateStringStream(pANTLR3_UINT8 data)
    443 {
    444 	// Pointer to the input stream we are going to create
    445 	//
    446 	pANTLR3_INPUT_STREAM    input;
    447 
    448 	if	(data == NULL)
    449 	{
    450 		return NULL;
    451 	}
    452 
    453 	// Allocate memory for the input stream structure
    454 	//
    455 	input   = (pANTLR3_INPUT_STREAM)
    456 		ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
    457 
    458 	if	(input == NULL)
    459 	{
    460 		return	NULL;
    461 	}
    462 
    463 	// Structure was allocated correctly, now we can install the pointer
    464 	//
    465         input->data             = data;
    466         input->isAllocated	= ANTLR3_FALSE;
    467 
    468 	// Call the common 8 bit input stream handler
    469 	// initialization.
    470 	//
    471 	antlr3GenericSetupStream(input);
    472 
    473         return  input;
    474 }