1 /** \file 2 * \brief The ANTLR3 C filestream is used when the source character stream 3 * is a filesystem based input set and all the characters in the filestream 4 * can be loaded at once into memory and away the lexer goes. 5 * 6 * A number of initializers are provided in order that various character 7 * sets can be supported from input files. The ANTLR3 C runtime expects 8 * to deal with UTF32 characters only (the reasons for this are to 9 * do with the simplification of C code when using this form of Unicode 10 * encoding, though this is not a panacea. More information can be 11 * found on this by consulting: 12 * - http://www.unicode.org/versions/Unicode4.0.0/ch02.pdf#G11178 13 * Where a well grounded discussion of the encoding formats available 14 * may be found. 15 * 16 */ 17 18 // [The "BSD licence"] 19 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC 20 // http://www.temporal-wave.com 21 // http://www.linkedin.com/in/jimidle 22 // 23 // All rights reserved. 24 // 25 // Redistribution and use in source and binary forms, with or without 26 // modification, are permitted provided that the following conditions 27 // are met: 28 // 1. Redistributions of source code must retain the above copyright 29 // notice, this list of conditions and the following disclaimer. 30 // 2. Redistributions in binary form must reproduce the above copyright 31 // notice, this list of conditions and the following disclaimer in the 32 // documentation and/or other materials provided with the distribution. 33 // 3. The name of the author may not be used to endorse or promote products 34 // derived from this software without specific prior written permission. 35 // 36 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 37 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 38 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 39 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 40 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 41 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 42 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 43 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 44 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 45 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 46 47 #include <antlr3.h> 48 49 static void setupInputStream (pANTLR3_INPUT_STREAM input); 50 static pANTLR3_INPUT_STREAM antlr3CreateFileStream (pANTLR3_UINT8 fileName); 51 static pANTLR3_INPUT_STREAM antlr3CreateStringStream (pANTLR3_UINT8 data); 52 53 ANTLR3_API pANTLR3_INPUT_STREAM 54 antlr3FileStreamNew(pANTLR3_UINT8 fileName, ANTLR3_UINT32 encoding) 55 { 56 pANTLR3_INPUT_STREAM input; 57 58 // First order of business is to read the file into some buffer space 59 // as just straight 8 bit bytes. Then we will work out the encoding and 60 // byte order and adjust the API functions that are installed for the 61 // default 8Bit stream accordingly. 62 // 63 input = antlr3CreateFileStream(fileName); 64 if (input == NULL) 65 { 66 return NULL; 67 } 68 69 // We have the data in memory now so we can deal with it according to 70 // the encoding scheme we were given by the user. 71 // 72 input->encoding = encoding; 73 74 // Now we need to work out the endian type and install any 75 // API functions that differ from 8Bit 76 // 77 setupInputStream(input); 78 79 // Now we can set up the file name 80 // 81 input->istream->streamName = input->strFactory->newStr8(input->strFactory, fileName); 82 input->fileName = input->istream->streamName; 83 84 return input; 85 } 86 87 88 ANTLR3_API pANTLR3_INPUT_STREAM 89 antlr3StringStreamNew(pANTLR3_UINT8 data, ANTLR3_UINT32 encoding, ANTLR3_UINT32 size, pANTLR3_UINT8 name) 90 { 91 pANTLR3_INPUT_STREAM input; 92 93 // First order of business is to set up the stream and install the data pointer. 94 // Then we will work out the encoding and byte order and adjust the API functions that are installed for the 95 // default 8Bit stream accordingly. 96 // 97 input = antlr3CreateStringStream(data); 98 if (input == NULL) 99 { 100 return NULL; 101 } 102 103 // Size (in bytes) of the given 'string' 104 // 105 input->sizeBuf = size; 106 107 // We have the data in memory now so we can deal with it according to 108 // the encoding scheme we were given by the user. 109 // 110 input->encoding = encoding; 111 112 // Now we need to work out the endian type and install any 113 // API functions that differ from 8Bit 114 // 115 setupInputStream(input); 116 117 // Now we can set up the file name 118 // 119 input->istream->streamName = input->strFactory->newStr8(input->strFactory, name); 120 input->fileName = input->istream->streamName; 121 122 return input; 123 } 124 125 126 /// Determine endianess of the input stream and install the 127 /// API required for the encoding in that format. 128 /// 129 static void 130 setupInputStream(pANTLR3_INPUT_STREAM input) 131 { 132 ANTLR3_BOOLEAN isBigEndian; 133 134 // Used to determine the endianness of the machine we are currently 135 // running on. 136 // 137 ANTLR3_UINT16 bomTest = 0xFEFF; 138 139 // What endianess is the machine we are running on? If the incoming 140 // encoding endianess is the same as this machine's natural byte order 141 // then we can use more efficient API calls. 142 // 143 if (*((pANTLR3_UINT8)(&bomTest)) == 0xFE) 144 { 145 isBigEndian = ANTLR3_TRUE; 146 } 147 else 148 { 149 isBigEndian = ANTLR3_FALSE; 150 } 151 152 // What encoding did the user tell us {s}he thought it was? I am going 153 // to get sick of the questions on antlr-interest, I know I am. 154 // 155 switch (input->encoding) 156 { 157 case ANTLR3_ENC_UTF8: 158 159 // See if there is a BOM at the start of this UTF-8 sequence 160 // and just eat it if there is. Windows .TXT files have this for instance 161 // as it identifies UTF-8 even though it is of no consequence for byte order 162 // as UTF-8 does not have a byte order. 163 // 164 if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xEF 165 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xBB 166 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2)) == 0xBF 167 ) 168 { 169 // The UTF8 BOM is present so skip it 170 // 171 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 3); 172 } 173 174 // Install the UTF8 input routines 175 // 176 antlr3UTF8SetupStream(input); 177 break; 178 179 case ANTLR3_ENC_UTF16: 180 181 // See if there is a BOM at the start of the input. If not then 182 // we assume that the byte order is the natural order of this 183 // machine (or it is really UCS2). If there is a BOM we determine if the encoding 184 // is the same as the natural order of this machine. 185 // 186 if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFE 187 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFF 188 ) 189 { 190 // BOM Present, indicates Big Endian 191 // 192 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2); 193 194 antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE); 195 } 196 else if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFF 197 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFE 198 ) 199 { 200 // BOM present, indicates Little Endian 201 // 202 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2); 203 204 antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE); 205 } 206 else 207 { 208 // No BOM present, assume local computer byte order 209 // 210 antlr3UTF16SetupStream(input, isBigEndian, isBigEndian); 211 } 212 break; 213 214 case ANTLR3_ENC_UTF32: 215 216 // See if there is a BOM at the start of the input. If not then 217 // we assume that the byte order is the natural order of this 218 // machine. If there is we determine if the encoding 219 // is the same as the natural order of this machine. 220 // 221 if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0x00 222 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00 223 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2)) == 0xFE 224 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+3)) == 0xFF 225 ) 226 { 227 // BOM Present, indicates Big Endian 228 // 229 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4); 230 231 antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE); 232 } 233 else if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFF 234 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFE 235 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00 236 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00 237 ) 238 { 239 // BOM present, indicates Little Endian 240 // 241 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4); 242 243 antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE); 244 } 245 else 246 { 247 // No BOM present, assume local computer byte order 248 // 249 antlr3UTF32SetupStream(input, isBigEndian, isBigEndian); 250 } 251 break; 252 253 case ANTLR3_ENC_UTF16BE: 254 255 // Encoding is definately Big Endian with no BOM 256 // 257 antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE); 258 break; 259 260 case ANTLR3_ENC_UTF16LE: 261 262 // Encoding is definately Little Endian with no BOM 263 // 264 antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE); 265 break; 266 267 case ANTLR3_ENC_UTF32BE: 268 269 // Encoding is definately Big Endian with no BOM 270 // 271 antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE); 272 break; 273 274 case ANTLR3_ENC_UTF32LE: 275 276 // Encoding is definately Little Endian with no BOM 277 // 278 antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE); 279 break; 280 281 case ANTLR3_ENC_EBCDIC: 282 283 // EBCDIC is basically the same as ASCII but with an on the 284 // fly translation to ASCII 285 // 286 antlr3EBCDICSetupStream(input); 287 break; 288 289 case ANTLR3_ENC_8BIT: 290 default: 291 292 // Standard 8bit/ASCII 293 // 294 antlr38BitSetupStream(input); 295 break; 296 } 297 } 298 299 /** \brief Use the contents of an operating system file as the input 300 * for an input stream. 301 * 302 * \param fileName Name of operating system file to read. 303 * \return 304 * - Pointer to new input stream context upon success 305 * - One of the ANTLR3_ERR_ defines on error. 306 */ 307 static pANTLR3_INPUT_STREAM 308 antlr3CreateFileStream(pANTLR3_UINT8 fileName) 309 { 310 // Pointer to the input stream we are going to create 311 // 312 pANTLR3_INPUT_STREAM input; 313 ANTLR3_UINT32 status; 314 315 if (fileName == NULL) 316 { 317 return NULL; 318 } 319 320 // Allocate memory for the input stream structure 321 // 322 input = (pANTLR3_INPUT_STREAM) 323 ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM)); 324 325 if (input == NULL) 326 { 327 return NULL; 328 } 329 330 // Structure was allocated correctly, now we can read the file. 331 // 332 status = antlr3read8Bit(input, fileName); 333 334 // Call the common 8 bit input stream handler 335 // initialization. 336 // 337 antlr3GenericSetupStream(input); 338 339 // However if the file was not there or something then we 340 // need to close. Have to wait until here as we cannot call 341 // close until the API is installed of course. 342 // 343 if (status != ANTLR3_SUCCESS) 344 { 345 input->close(input); 346 return NULL; 347 } 348 349 return input; 350 } 351 352 ANTLR3_API ANTLR3_UINT32 353 antlr3read8Bit(pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 fileName) 354 { 355 ANTLR3_FDSC infile; 356 ANTLR3_UINT32 fSize; 357 358 /* Open the OS file in read binary mode 359 */ 360 infile = antlr3Fopen(fileName, "rb"); 361 362 /* Check that it was there 363 */ 364 if (infile == NULL) 365 { 366 return (ANTLR3_UINT32)ANTLR3_ERR_NOFILE; 367 } 368 369 /* It was there, so we can read the bytes now 370 */ 371 fSize = antlr3Fsize(fileName); /* Size of input file */ 372 373 /* Allocate buffer for this input set 374 */ 375 input->data = ANTLR3_MALLOC((size_t)fSize); 376 input->sizeBuf = fSize; 377 378 if (input->data == NULL) 379 { 380 return (ANTLR3_UINT32)ANTLR3_ERR_NOMEM; 381 } 382 383 input->isAllocated = ANTLR3_TRUE; 384 385 /* Now we read the file. Characters are not converted to 386 * the internal ANTLR encoding until they are read from the buffer 387 */ 388 antlr3Fread(infile, fSize, input->data); 389 390 /* And close the file handle 391 */ 392 antlr3Fclose(infile); 393 394 return ANTLR3_SUCCESS; 395 } 396 397 /** \brief Open an operating system file and return the descriptor 398 * We just use the common open() and related functions here. 399 * Later we might find better ways on systems 400 * such as Windows and OpenVMS for instance. But the idea is to read the 401 * while file at once anyway, so it may be irrelevant. 402 */ 403 ANTLR3_API ANTLR3_FDSC 404 antlr3Fopen(pANTLR3_UINT8 filename, const char * mode) 405 { 406 return (ANTLR3_FDSC)fopen((const char *)filename, mode); 407 } 408 409 /** \brief Close an operating system file and free any handles 410 * etc. 411 */ 412 ANTLR3_API void 413 antlr3Fclose(ANTLR3_FDSC fd) 414 { 415 fclose(fd); 416 } 417 ANTLR3_API ANTLR3_UINT32 418 antlr3Fsize(pANTLR3_UINT8 fileName) 419 { 420 struct _stat statbuf; 421 422 _stat((const char *)fileName, &statbuf); 423 424 return (ANTLR3_UINT32)statbuf.st_size; 425 } 426 427 ANTLR3_API ANTLR3_UINT32 428 antlr3Fread(ANTLR3_FDSC fdsc, ANTLR3_UINT32 count, void * data) 429 { 430 return (ANTLR3_UINT32)fread(data, (size_t)count, 1, fdsc); 431 } 432 433 434 /** \brief Use the supplied 'string' as input to the stream 435 * 436 * \param data Pointer to the input data 437 * \return 438 * - Pointer to new input stream context upon success 439 * - NULL defines on error. 440 */ 441 static pANTLR3_INPUT_STREAM 442 antlr3CreateStringStream(pANTLR3_UINT8 data) 443 { 444 // Pointer to the input stream we are going to create 445 // 446 pANTLR3_INPUT_STREAM input; 447 448 if (data == NULL) 449 { 450 return NULL; 451 } 452 453 // Allocate memory for the input stream structure 454 // 455 input = (pANTLR3_INPUT_STREAM) 456 ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM)); 457 458 if (input == NULL) 459 { 460 return NULL; 461 } 462 463 // Structure was allocated correctly, now we can install the pointer 464 // 465 input->data = data; 466 input->isAllocated = ANTLR3_FALSE; 467 468 // Call the common 8 bit input stream handler 469 // initialization. 470 // 471 antlr3GenericSetupStream(input); 472 473 return input; 474 }