1 /// \file 2 /// Base functions to initialize and manipulate any input stream 3 /// 4 5 // [The "BSD licence"] 6 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC 7 // http://www.temporal-wave.com 8 // http://www.linkedin.com/in/jimidle 9 // 10 // All rights reserved. 11 // 12 // Redistribution and use in source and binary forms, with or without 13 // modification, are permitted provided that the following conditions 14 // are met: 15 // 1. Redistributions of source code must retain the above copyright 16 // notice, this list of conditions and the following disclaimer. 17 // 2. Redistributions in binary form must reproduce the above copyright 18 // notice, this list of conditions and the following disclaimer in the 19 // documentation and/or other materials provided with the distribution. 20 // 3. The name of the author may not be used to endorse or promote products 21 // derived from this software without specific prior written permission. 22 // 23 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 24 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 25 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 26 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 27 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 28 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 32 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <antlr3input.h>

// Forward declarations for the file-local stream implementations. The 8 bit
// and UTF16 routines declared here are installed as function pointers by the
// setup functions further down in this file.

// -----------------------------------
// Generic 8 bit input such as latin-1
//

// 8Bit INT Stream API
//
static void             antlr38BitConsume           (pANTLR3_INT_STREAM is);
static ANTLR3_UCHAR     antlr38BitLA                (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
static ANTLR3_UCHAR     antlr38BitLA_ucase          (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
static ANTLR3_MARKER    antlr38BitIndex             (pANTLR3_INT_STREAM is);
static ANTLR3_MARKER    antlr38BitMark              (pANTLR3_INT_STREAM is);
static void             antlr38BitRewind            (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
static void             antlr38BitRewindLast        (pANTLR3_INT_STREAM is);
static void             antlr38BitRelease           (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
static void             antlr38BitSeek              (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
static pANTLR3_STRING   antlr38BitGetSourceName     (pANTLR3_INT_STREAM is);

// 8Bit Charstream API functions
//
static void             antlr3InputClose            (pANTLR3_INPUT_STREAM input);
static void             antlr3InputReset            (pANTLR3_INPUT_STREAM input);
static void             antlr38BitReuse             (pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name);
static void *           antlr38BitLT                (pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt);
static ANTLR3_UINT32    antlr38BitSize              (pANTLR3_INPUT_STREAM input);
static pANTLR3_STRING   antlr38BitSubstr            (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
static ANTLR3_UINT32    antlr38BitGetLine           (pANTLR3_INPUT_STREAM input);
static void *           antlr38BitGetLineBuf        (pANTLR3_INPUT_STREAM input);
static ANTLR3_UINT32    antlr38BitGetCharPosition   (pANTLR3_INPUT_STREAM input);
static void             antlr38BitSetLine           (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line);
static void             antlr38BitSetCharPosition   (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position);
static void             antlr38BitSetNewLineChar    (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar);
static void             antlr38BitSetUcaseLA        (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);

// -----------------------------------
// UTF16 (also covers UCS2)
//
// INT Stream API
//
// The plain variants read 16 bit units in machine byte order; the LE/BE
// variants are for input whose byte order differs from the machine's.
//
static void             antlr3UTF16Consume          (pANTLR3_INT_STREAM is);
static ANTLR3_UCHAR     antlr3UTF16LA               (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
static void             antlr3UTF16ConsumeLE        (pANTLR3_INT_STREAM is);
static ANTLR3_UCHAR     antlr3UTF16LALE             (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
static void             antlr3UTF16ConsumeBE        (pANTLR3_INT_STREAM is);
static ANTLR3_UCHAR     antlr3UTF16LABE             (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
static ANTLR3_MARKER    antlr3UTF16Index            (pANTLR3_INT_STREAM is);
static void             antlr3UTF16Seek             (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);

// UTF16 Charstream API functions
//
static pANTLR3_STRING   antlr3UTF16Substr           (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);

// -----------------------------------
// UTF32 (also covers UCS4)
//
// INT Stream API
//
static void             antlr3UTF32Consume          (pANTLR3_INT_STREAM is);
static ANTLR3_UCHAR     antlr3UTF32LA               (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
static ANTLR3_UCHAR     antlr3UTF32LALE             (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
static ANTLR3_UCHAR     antlr3UTF32LABE             (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
static ANTLR3_MARKER    antlr3UTF32Index            (pANTLR3_INT_STREAM is);
static void             antlr3UTF32Seek             (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);

// UTF32 Charstream API functions
//
static pANTLR3_STRING   antlr3UTF32Substr           (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);

// ------------------------------------
// UTF-8
//
static void             antlr3UTF8Consume           (pANTLR3_INT_STREAM is);
static ANTLR3_UCHAR     antlr3UTF8LA                (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);

// ------------------------------------
// EBCDIC
//
static ANTLR3_UCHAR     antlr3EBCDICLA              (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);

/// \brief Common function to setup function interface
for an 8 bit input stream. 115 /// 116 /// \param input Input stream context pointer 117 /// 118 /// \remark 119 /// - Many of the 8 bit oriented file stream handling functions will be usable 120 /// by any or at least some, other input streams. Therefore it is perfectly acceptable 121 /// to call this function to install the 8Bit handler then override just those functions 122 /// that would not work for the particular input encoding, such as consume for instance. 123 /// 124 void 125 antlr38BitSetupStream (pANTLR3_INPUT_STREAM input) 126 { 127 // Build a string factory for this stream 128 // 129 input->strFactory = antlr3StringFactoryNew(input->encoding); 130 131 // Default stream API set up is for 8Bit, so we are done 132 // 133 } 134 135 void 136 antlr3GenericSetupStream (pANTLR3_INPUT_STREAM input) 137 { 138 /* Install function pointers for an 8 bit input 139 */ 140 141 /* Allocate stream interface 142 */ 143 input->istream = antlr3IntStreamNew(); 144 input->istream->type = ANTLR3_CHARSTREAM; 145 input->istream->super = input; 146 147 /* Intstream API 148 */ 149 input->istream->consume = antlr38BitConsume; // Consume the next 8 bit character in the buffer 150 input->istream->_LA = antlr38BitLA; // Return the UTF32 character at offset n (1 based) 151 input->istream->index = antlr38BitIndex; // Current index (offset from first character 152 input->istream->mark = antlr38BitMark; // Record the current lex state for later restore 153 input->istream->rewind = antlr38BitRewind; // How to rewind the input 154 input->istream->rewindLast = antlr38BitRewindLast; // How to rewind the input 155 input->istream->seek = antlr38BitSeek; // How to seek to a specific point in the stream 156 input->istream->release = antlr38BitRelease; // Reset marks after mark n 157 input->istream->getSourceName = antlr38BitGetSourceName; // Return a string that names the input source 158 159 /* Charstream API 160 */ 161 input->close = antlr3InputClose; // Close down the stream completely 162 
input->free = antlr3InputClose; // Synonym for free 163 input->reset = antlr3InputReset; // Reset input to start 164 input->reuse = antlr38BitReuse; // Install a new input string and reset 165 input->_LT = antlr38BitLT; // Same as _LA for 8 bit file 166 input->size = antlr38BitSize; // Return the size of the input buffer 167 input->substr = antlr38BitSubstr; // Return a string from the input stream 168 input->getLine = antlr38BitGetLine; // Return the current line number in the input stream 169 input->getLineBuf = antlr38BitGetLineBuf; // Return a pointer to the start of the current line being consumed 170 input->getCharPositionInLine = antlr38BitGetCharPosition; // Return the offset into the current line of input 171 input->setLine = antlr38BitSetLine; // Set the input stream line number (does not set buffer pointers) 172 input->setCharPositionInLine = antlr38BitSetCharPosition; // Set the offset in to the current line (does not set any pointers) 173 input->SetNewLineChar = antlr38BitSetNewLineChar; // Set the value of the newline trigger character 174 input->setUcaseLA = antlr38BitSetUcaseLA; // Changes the LA function to return upper case always 175 176 input->charByteSize = 1; // Size in bytes of characters in this stream. 177 178 /* Initialize entries for tables etc 179 */ 180 input->markers = NULL; 181 182 /* Set up the input stream brand new 183 */ 184 input->reset(input); 185 186 /* Install default line separator character (it can be replaced 187 * by the grammar programmer later) 188 */ 189 input->SetNewLineChar(input, (ANTLR3_UCHAR)'\n'); 190 } 191 192 static pANTLR3_STRING 193 antlr38BitGetSourceName(pANTLR3_INT_STREAM is) 194 { 195 return is->streamName; 196 } 197 198 /** \brief Close down an input stream and free any memory allocated by it. 
199 * 200 * \param input Input stream context pointer 201 */ 202 static void 203 antlr3InputClose(pANTLR3_INPUT_STREAM input) 204 { 205 // Close any markers in the input stream 206 // 207 if (input->markers != NULL) 208 { 209 input->markers->free(input->markers); 210 input->markers = NULL; 211 } 212 213 // Close the string factory 214 // 215 if (input->strFactory != NULL) 216 { 217 input->strFactory->close(input->strFactory); 218 } 219 220 // Free the input stream buffer if we allocated it 221 // 222 if (input->isAllocated && input->data != NULL) 223 { 224 ANTLR3_FREE(input->data); 225 input->data = NULL; 226 } 227 228 input->istream->free(input->istream); 229 230 // Finally, free the space for the structure itself 231 // 232 ANTLR3_FREE(input); 233 234 // Done 235 // 236 } 237 238 static void 239 antlr38BitSetUcaseLA (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag) 240 { 241 if (flag) 242 { 243 // Return the upper case version of the characters 244 // 245 input->istream->_LA = antlr38BitLA_ucase; 246 } 247 else 248 { 249 // Return the raw characters as they are in the buffer 250 // 251 input->istream->_LA = antlr38BitLA; 252 } 253 } 254 255 256 /** \brief Reset a re-startable input stream to the start 257 * 258 * \param input Input stream context pointer 259 */ 260 static void 261 antlr3InputReset(pANTLR3_INPUT_STREAM input) 262 { 263 264 input->nextChar = input->data; /* Input at first character */ 265 input->line = 1; /* starts at line 1 */ 266 input->charPositionInLine = -1; 267 input->currentLine = input->data; 268 input->markDepth = 0; /* Reset markers */ 269 270 /* Clear out up the markers table if it is there 271 */ 272 if (input->markers != NULL) 273 { 274 input->markers->clear(input->markers); 275 } 276 else 277 { 278 /* Install a new markers table 279 */ 280 input->markers = antlr3VectorNew(0); 281 } 282 } 283 284 /** Install a new source code in to a working input stream so that the 285 * input stream can be reused. 
286 */ 287 static void 288 antlr38BitReuse(pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name) 289 { 290 input->isAllocated = ANTLR3_FALSE; 291 input->data = inString; 292 input->sizeBuf = size; 293 294 // Now we can set up the file name. As we are reusing the stream, there may already 295 // be a string that we can reuse for holding the filename. 296 // 297 if (input->istream->streamName == NULL) 298 { 299 input->istream->streamName = input->strFactory->newStr(input->strFactory, name == NULL ? (pANTLR3_UINT8)"-memory-" : name); 300 input->fileName = input->istream->streamName; 301 } 302 else 303 { 304 input->istream->streamName->set(input->istream->streamName, (name == NULL ? (const char *)"-memory-" : (const char *)name)); 305 } 306 307 input->reset(input); 308 } 309 310 /** \brief Consume the next character in an 8 bit input stream 311 * 312 * \param input Input stream context pointer 313 */ 314 static void 315 antlr38BitConsume(pANTLR3_INT_STREAM is) 316 { 317 pANTLR3_INPUT_STREAM input; 318 319 input = ((pANTLR3_INPUT_STREAM) (is->super)); 320 321 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 322 { 323 /* Indicate one more character in this line 324 */ 325 input->charPositionInLine++; 326 327 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar)) == input->newlineChar) 328 { 329 /* Reset for start of a new line of input 330 */ 331 input->line++; 332 input->charPositionInLine = 0; 333 input->currentLine = (void *)(((pANTLR3_UINT8)input->nextChar) + 1); 334 } 335 336 /* Increment to next character position 337 */ 338 input->nextChar = (void *)(((pANTLR3_UINT8)input->nextChar) + 1); 339 } 340 } 341 342 /** \brief Return the input element assuming an 8 bit ascii input 343 * 344 * \param[in] input Input stream context pointer 345 * \param[in] la 1 based offset of next input stream element 346 * 347 * \return Next input character in internal ANTLR3 encoding (UTF32) 348 */ 349 static 
ANTLR3_UCHAR 350 antlr38BitLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la) 351 { 352 pANTLR3_INPUT_STREAM input; 353 354 input = ((pANTLR3_INPUT_STREAM) (is->super)); 355 356 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 357 { 358 return ANTLR3_CHARSTREAM_EOF; 359 } 360 else 361 { 362 return (ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar + la - 1)); 363 } 364 } 365 366 /** \brief Return the input element assuming an 8 bit input and 367 * always return the UPPER CASE character. 368 * Note that this is 8 bit and so we assume that the toupper 369 * function will use the correct locale for 8 bits. 370 * 371 * \param[in] input Input stream context pointer 372 * \param[in] la 1 based offset of next input stream element 373 * 374 * \return Next input character in internal ANTLR3 encoding (UTF32) 375 */ 376 static ANTLR3_UCHAR 377 antlr38BitLA_ucase (pANTLR3_INT_STREAM is, ANTLR3_INT32 la) 378 { 379 pANTLR3_INPUT_STREAM input; 380 381 input = ((pANTLR3_INPUT_STREAM) (is->super)); 382 383 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 384 { 385 return ANTLR3_CHARSTREAM_EOF; 386 } 387 else 388 { 389 return (ANTLR3_UCHAR)toupper((*((pANTLR3_UINT8)input->nextChar + la - 1))); 390 } 391 } 392 393 394 /** \brief Return the input element assuming an 8 bit ascii input 395 * 396 * \param[in] input Input stream context pointer 397 * \param[in] lt 1 based offset of next input stream element 398 * 399 * \return Next input character in internal ANTLR3 encoding (UTF32) 400 */ 401 static void * 402 antlr38BitLT(pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt) 403 { 404 /* Casting is horrible but it means no warnings and LT should never be called 405 * on a character stream anyway I think. If it is then, the void * will need to be 406 * cast back in a similar manner. Yuck! But this means that LT for Token streams and 407 * tree streams is correct. 
408 */ 409 return (ANTLR3_FUNC_PTR(input->istream->_LA(input->istream, lt))); 410 } 411 412 /** \brief Calculate the current index in the output stream. 413 * \param[in] input Input stream context pointer 414 */ 415 static ANTLR3_MARKER 416 antlr38BitIndex(pANTLR3_INT_STREAM is) 417 { 418 pANTLR3_INPUT_STREAM input; 419 420 input = ((pANTLR3_INPUT_STREAM) (is->super)); 421 422 return (ANTLR3_MARKER)(((pANTLR3_UINT8)input->nextChar)); 423 } 424 425 /** \brief Return the size of the current input stream, as an 8Bit file 426 * which in this case is the total input. Other implementations may provide 427 * more sophisticated implementations to deal with non-recoverable streams 428 * and so on. 429 * 430 * \param[in] input Input stream context pointer 431 */ 432 static ANTLR3_UINT32 433 antlr38BitSize(pANTLR3_INPUT_STREAM input) 434 { 435 return input->sizeBuf; 436 } 437 438 /** \brief Mark the current input point in an 8Bit 8 bit stream 439 * such as a file stream, where all the input is available in the 440 * buffer. 
441 * 442 * \param[in] is Input stream context pointer 443 */ 444 static ANTLR3_MARKER 445 antlr38BitMark (pANTLR3_INT_STREAM is) 446 { 447 pANTLR3_LEX_STATE state; 448 pANTLR3_INPUT_STREAM input; 449 450 input = ((pANTLR3_INPUT_STREAM) (is->super)); 451 452 /* New mark point 453 */ 454 input->markDepth++; 455 456 /* See if we are revisiting a mark as we can just reuse the vector 457 * entry if we are, otherwise, we need a new one 458 */ 459 if (input->markDepth > input->markers->count) 460 { 461 state = ANTLR3_MALLOC(sizeof(ANTLR3_LEX_STATE)); 462 463 /* Add it to the table 464 */ 465 input->markers->add(input->markers, state, ANTLR3_FREE_FUNC); /* No special structure, just free() on delete */ 466 } 467 else 468 { 469 state = (pANTLR3_LEX_STATE)input->markers->get(input->markers, input->markDepth - 1); 470 471 /* Assume no errors for speed, it will just blow up if the table failed 472 * for some reasons, hence lots of unit tests on the tables ;-) 473 */ 474 } 475 476 /* We have created or retrieved the state, so update it with the current 477 * elements of the lexer state. 478 */ 479 state->charPositionInLine = input->charPositionInLine; 480 state->currentLine = input->currentLine; 481 state->line = input->line; 482 state->nextChar = input->nextChar; 483 484 is->lastMarker = input->markDepth; 485 486 /* And that's it 487 */ 488 return input->markDepth; 489 } 490 /** \brief Rewind the lexer input to the state specified by the last produced mark. 491 * 492 * \param[in] input Input stream context pointer 493 * 494 * \remark 495 * Assumes 8 Bit input stream. 496 */ 497 static void 498 antlr38BitRewindLast (pANTLR3_INT_STREAM is) 499 { 500 is->rewind(is, is->lastMarker); 501 } 502 503 /** \brief Rewind the lexer input to the state specified by the supplied mark. 504 * 505 * \param[in] input Input stream context pointer 506 * 507 * \remark 508 * Assumes 8 Bit input stream. 
509 */ 510 static void 511 antlr38BitRewind (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark) 512 { 513 pANTLR3_LEX_STATE state; 514 pANTLR3_INPUT_STREAM input; 515 516 input = ((pANTLR3_INPUT_STREAM) is->super); 517 518 /* Perform any clean up of the marks 519 */ 520 input->istream->release(input->istream, mark); 521 522 /* Find the supplied mark state 523 */ 524 state = (pANTLR3_LEX_STATE)input->markers->get(input->markers, (ANTLR3_UINT32)(mark - 1)); 525 526 /* Seek input pointer to the requested point (note we supply the void *pointer 527 * to whatever is implementing the int stream to seek). 528 */ 529 antlr38BitSeek(is, (ANTLR3_MARKER)(state->nextChar)); 530 531 /* Reset to the reset of the information in the mark 532 */ 533 input->charPositionInLine = state->charPositionInLine; 534 input->currentLine = state->currentLine; 535 input->line = state->line; 536 input->nextChar = state->nextChar; 537 538 /* And we are done 539 */ 540 } 541 542 /** \brief Rewind the lexer input to the state specified by the supplied mark. 543 * 544 * \param[in] input Input stream context pointer 545 * 546 * \remark 547 * Assumes 8 Bit input stream. 548 */ 549 static void 550 antlr38BitRelease (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark) 551 { 552 pANTLR3_INPUT_STREAM input; 553 554 input = ((pANTLR3_INPUT_STREAM) (is->super)); 555 556 /* We don't do much here in fact as we never free any higher marks in 557 * the hashtable as we just resuse any memory allocated for them. 558 */ 559 input->markDepth = (ANTLR3_UINT32)(mark - 1); 560 } 561 562 /** \brief Rewind the lexer input to the state specified by the supplied mark. 563 * 564 * \param[in] input Input stream context pointer 565 * 566 * \remark 567 * Assumes 8 Bit input stream. 
568 */ 569 static void 570 antlr38BitSeek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint) 571 { 572 ANTLR3_INT32 count; 573 pANTLR3_INPUT_STREAM input; 574 575 input = ANTLR3_FUNC_PTR(((pANTLR3_INPUT_STREAM) is->super)); 576 577 /* If the requested seek point is less than the current 578 * input point, then we assume that we are resetting from a mark 579 * and do not need to scan, but can just set to there. 580 */ 581 if (seekPoint <= (ANTLR3_MARKER)(input->nextChar)) 582 { 583 input->nextChar = ((pANTLR3_UINT8) seekPoint); 584 } 585 else 586 { 587 count = (ANTLR3_UINT32)(seekPoint - (ANTLR3_MARKER)(input->nextChar)); 588 589 while (count--) 590 { 591 is->consume(is); 592 } 593 } 594 } 595 /** Return a substring of the 8 bit input stream in 596 * newly allocated memory. 597 * 598 * \param input Input stream context pointer 599 * \param start Offset in input stream where the string starts 600 * \param stop Offset in the input stream where the string ends. 601 */ 602 static pANTLR3_STRING 603 antlr38BitSubstr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop) 604 { 605 return input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, (ANTLR3_UINT32)(stop - start + 1)); 606 } 607 608 /** \brief Return the line number as understood by the 8 bit input stream. 609 * 610 * \param input Input stream context pointer 611 * \return Line number in input stream that we believe we are working on. 612 */ 613 static ANTLR3_UINT32 614 antlr38BitGetLine (pANTLR3_INPUT_STREAM input) 615 { 616 return input->line; 617 } 618 619 /** Return a pointer into the input stream that points at the start 620 * of the current input line as triggered by the end of line character installed 621 * for the stream ('\n' unless told differently). 622 * 623 * \param[in] input 624 */ 625 static void * 626 antlr38BitGetLineBuf (pANTLR3_INPUT_STREAM input) 627 { 628 return input->currentLine; 629 } 630 631 /** Return the current offset in to the current line in the input stream. 
632 * 633 * \param input Input stream context pointer 634 * \return Current line offset 635 */ 636 static ANTLR3_UINT32 637 antlr38BitGetCharPosition (pANTLR3_INPUT_STREAM input) 638 { 639 return input->charPositionInLine; 640 } 641 642 /** Set the current line number as understood by the input stream. 643 * 644 * \param input Input stream context pointer 645 * \param line Line number to tell the input stream we are on 646 * 647 * \remark 648 * This function does not change any pointers, it just allows the programmer to set the 649 * line number according to some external criterion, such as finding a lexed directive 650 * like: #nnn "file.c" for instance, such that error reporting and so on in is in sync 651 * with some original source format. 652 */ 653 static void 654 antlr38BitSetLine (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line) 655 { 656 input->line = line; 657 } 658 659 /** Set the current offset in the current line to be a particular setting. 660 * 661 * \param[in] input Input stream context pointer 662 * \param[in] position New setting for current offset. 663 * 664 * \remark 665 * This does not set the actual pointers in the input stream, it is purely for reporting 666 * purposes and so on as per antlr38BitSetLine(); 667 */ 668 static void 669 antlr38BitSetCharPosition (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position) 670 { 671 input->charPositionInLine = position; 672 } 673 674 /** Set the newline trigger character in the input stream to the supplied parameter. 675 * 676 * \param[in] input Input stream context pointer 677 * \param[in] newlineChar Character to set to be the newline trigger. 678 * 679 * \remark 680 * - The supplied newLineChar is in UTF32 encoding (which means ASCII and latin1 etc 681 * are the same encodings), but the input stream catered to by this function is 8 bit 682 * only, so it is up to the programmer to ensure that the character supplied is valid. 
683 */ 684 static void 685 antlr38BitSetNewLineChar (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar) 686 { 687 input->newlineChar = newlineChar; 688 } 689 690 691 /// \brief Common function to setup function interface for a UTF16 or UCS2 input stream. 692 /// 693 /// \param input Input stream context pointer 694 /// 695 /// \remark 696 /// - Strictly speaking, there is no such thing as a UCS2 input stream as the term 697 /// tends to confuse the notions of character encoding, unicode and so on. UCS2 is 698 /// essentially UTF16 without any surrogates and so the standard UTF16 699 /// input stream is able to handle it without any special code. 700 /// 701 void 702 antlr3UTF16SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian) 703 { 704 // Build a string factory for this stream. This is a UTF16 string factory which is a standard 705 // part of the ANTLR3 string. The string factory is then passed through the whole chain 706 // of lexer->parser->tree->treeparser and so on. 707 // 708 input->strFactory = antlr3StringFactoryNew(input->encoding); 709 710 // Generic API that does not care about endianess. 711 // 712 input->istream->index = antlr3UTF16Index; // Calculate current index in input stream, UTF16 based 713 input->substr = antlr3UTF16Substr; // Return a string from the input stream 714 input->istream->seek = antlr3UTF16Seek; // How to seek to a specific point in the stream 715 716 // We must install different UTF16 routines according to whether the input 717 // is the same endianess as the machine we are executing upon or not. If it is not 718 // then we must install methods that can convert the endianess on the fly as they go 719 // 720 721 switch (machineBigEndian) 722 { 723 case ANTLR3_TRUE: 724 725 // Machine is Big Endian, if the input is also then install the 726 // methods that do not access input by bytes and reverse them. 727 // Otherwise install endian aware methods. 
728 // 729 if (inputBigEndian == ANTLR3_TRUE) 730 { 731 // Input is machine compatible 732 // 733 input->istream->consume = antlr3UTF16Consume; // Consume the next UTF16 character in the buffer 734 input->istream->_LA = antlr3UTF16LA; // Return the UTF32 character at offset n (1 based) 735 } 736 else 737 { 738 // Need to use methods that know that the input is little endian 739 // 740 input->istream->consume = antlr3UTF16ConsumeLE; // Consume the next UTF16 character in the buffer 741 input->istream->_LA = antlr3UTF16LALE; // Return the UTF32 character at offset n (1 based) 742 } 743 break; 744 745 case ANTLR3_FALSE: 746 747 // Machine is Little Endian, if the input is also then install the 748 // methods that do not access input by bytes and reverse them. 749 // Otherwise install endian aware methods. 750 // 751 if (inputBigEndian == ANTLR3_FALSE) 752 { 753 // Input is machine compatible 754 // 755 input->istream->consume = antlr3UTF16Consume; // Consume the next UTF16 character in the buffer 756 input->istream->_LA = antlr3UTF16LA; // Return the UTF32 character at offset n (1 based) 757 } 758 else 759 { 760 // Need to use methods that know that the input is Big Endian 761 // 762 input->istream->consume = antlr3UTF16ConsumeBE; // Consume the next UTF16 character in the buffer 763 input->istream->_LA = antlr3UTF16LABE; // Return the UTF32 character at offset n (1 based) 764 } 765 break; 766 } 767 768 769 input->charByteSize = 2; // Size in bytes of characters in this stream. 
770 771 } 772 773 /// \brief Consume the next character in a UTF16 input stream 774 /// 775 /// \param input Input stream context pointer 776 /// 777 static void 778 antlr3UTF16Consume(pANTLR3_INT_STREAM is) 779 { 780 pANTLR3_INPUT_STREAM input; 781 UTF32 ch; 782 UTF32 ch2; 783 784 input = ((pANTLR3_INPUT_STREAM) (is->super)); 785 786 // Buffer size is always in bytes 787 // 788 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 789 { 790 // Indicate one more character in this line 791 // 792 input->charPositionInLine++; 793 794 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar) 795 { 796 // Reset for start of a new line of input 797 // 798 input->line++; 799 input->charPositionInLine = 0; 800 input->currentLine = (void *)(((pANTLR3_UINT16)input->nextChar) + 1); 801 } 802 803 // Increment to next character position, accounting for any surrogates 804 // 805 // Next char in natural machine byte order 806 // 807 ch = *((UTF16*)input->nextChar); 808 809 // We consumed one 16 bit character 810 // 811 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1); 812 813 // If we have a surrogate pair then we need to consume 814 // a following valid LO surrogate. 815 // 816 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 817 818 // If the 16 bits following the high surrogate are in the source buffer... 819 // 820 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 821 { 822 // Next character is in natural machine byte order 823 // 824 ch2 = *((UTF16*)input->nextChar); 825 826 // If it's a valid low surrogate, consume it 827 // 828 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 829 { 830 // We consumed one 16 bit character 831 // 832 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1); 833 } 834 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 835 // it. 
836 // 837 } 838 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 839 // it because the buffer ended 840 // 841 } 842 // Note that we did not check for an invalid low surrogate here, or that fact that the 843 // lo surrogate was missing. We just picked out one 16 bit character unless the character 844 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters. 845 // 846 } 847 } 848 849 /// \brief Return the input element assuming an 8 bit ascii input 850 /// 851 /// \param[in] input Input stream context pointer 852 /// \param[in] la 1 based offset of next input stream element 853 /// 854 /// \return Next input character in internal ANTLR3 encoding (UTF32) 855 /// 856 static ANTLR3_UCHAR 857 antlr3UTF16LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la) 858 { 859 pANTLR3_INPUT_STREAM input; 860 UTF32 ch; 861 UTF32 ch2; 862 UTF16 * nextChar; 863 864 // Find the input interface and where we are currently pointing to 865 // in the input stream 866 // 867 input = ((pANTLR3_INPUT_STREAM) (is->super)); 868 nextChar = input->nextChar; 869 870 // If a positive offset then advance forward, else retreat 871 // 872 if (la >= 0) 873 { 874 while (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf ) 875 { 876 // Advance our copy of the input pointer 877 // 878 // Next char in natural machine byte order 879 // 880 ch = *nextChar++; 881 882 // If we have a surrogate pair then we need to consume 883 // a following valid LO surrogate. 884 // 885 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 886 { 887 // If the 16 bits following the high surrogate are in the source buffer... 
888 // 889 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 890 { 891 // Next character is in natural machine byte order 892 // 893 ch2 = *nextChar; 894 895 // If it's a valid low surrogate, consume it 896 // 897 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 898 { 899 // We consumed one 16 bit character 900 // 901 nextChar++; 902 } 903 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 904 // it. 905 // 906 } 907 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 908 // it because the buffer ended 909 // 910 } 911 // Note that we did not check for an invalid low surrogate here, or that fact that the 912 // lo surrogate was missing. We just picked out one 16 bit character unless the character 913 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters. 914 // 915 } 916 } 917 else 918 { 919 // We need to go backwards from our input point 920 // 921 while (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data ) 922 { 923 // Get the previous 16 bit character 924 // 925 ch = *--nextChar; 926 927 // If we found a low surrogate then go back one more character if 928 // the hi surrogate is there 929 // 930 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) 931 { 932 ch2 = *(nextChar-1); 933 if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END) 934 { 935 // Yes, there is a high surrogate to match it so decrement one more and point to that 936 // 937 nextChar--; 938 } 939 } 940 } 941 } 942 943 // Our local copy of nextChar is now pointing to either the correct character or end of file 944 // 945 // Input buffer size is always in bytes 946 // 947 if ( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 948 { 949 return ANTLR3_CHARSTREAM_EOF; 950 } 951 else 952 { 953 // Pick up the next 16 character (native machine byte order) 954 // 955 ch = *nextChar++; 956 957 // If we have a surrogate pair then we need to consume 
958 // a following valid LO surrogate. 959 // 960 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 961 { 962 // If the 16 bits following the high surrogate are in the source buffer... 963 // 964 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 965 { 966 // Next character is in natural machine byte order 967 // 968 ch2 = *nextChar; 969 970 // If it's a valid low surrogate, consume it 971 // 972 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 973 { 974 // Construct the UTF32 code point 975 // 976 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 977 + (ch2 - UNI_SUR_LOW_START) + halfBase; 978 } 979 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 980 // it. 981 // 982 } 983 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 984 // it because the buffer ended 985 // 986 } 987 } 988 return ch; 989 } 990 991 992 /// \brief Calculate the current index in the output stream. 993 /// \param[in] input Input stream context pointer 994 /// 995 static ANTLR3_MARKER 996 antlr3UTF16Index(pANTLR3_INT_STREAM is) 997 { 998 pANTLR3_INPUT_STREAM input; 999 1000 input = ((pANTLR3_INPUT_STREAM) (is->super)); 1001 1002 return (ANTLR3_MARKER)(input->nextChar); 1003 } 1004 1005 /// \brief Rewind the lexer input to the state specified by the supplied mark. 1006 /// 1007 /// \param[in] input Input stream context pointer 1008 /// 1009 /// \remark 1010 /// Assumes UTF16 input stream. 1011 /// 1012 static void 1013 antlr3UTF16Seek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint) 1014 { 1015 pANTLR3_INPUT_STREAM input; 1016 1017 input = ((pANTLR3_INPUT_STREAM) is->super); 1018 1019 // If the requested seek point is less than the current 1020 // input point, then we assume that we are resetting from a mark 1021 // and do not need to scan, but can just set to there as rewind will 1022 // reset line numbers and so on. 
1023 // 1024 if (seekPoint <= (ANTLR3_MARKER)(input->nextChar)) 1025 { 1026 input->nextChar = (void *)seekPoint; 1027 } 1028 else 1029 { 1030 // Call consume until we reach the asked for seek point or EOF 1031 // 1032 while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar) 1033 { 1034 is->consume(is); 1035 } 1036 } 1037 } 1038 /// \brief Return a substring of the UTF16 input stream in 1039 /// newly allocated memory. 1040 /// 1041 /// \param input Input stream context pointer 1042 /// \param start Offset in input stream where the string starts 1043 /// \param stop Offset in the input stream where the string ends. 1044 /// 1045 static pANTLR3_STRING 1046 antlr3UTF16Substr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop) 1047 { 1048 return input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/2) + 1); 1049 } 1050 1051 /// \brief Consume the next character in a UTF16 input stream when the input is Little Endian and the machine is not 1052 /// Note that the UTF16 routines do not do any substantial verification of the input stream as for performance 1053 /// sake, we assume it is validly encoded. So if a low surrogate is found at the curent input position then we 1054 /// just consume it. Surrogate pairs should be seen as Hi, Lo. So if we have a Lo first, then the input stream 1055 /// is fubar but we just ignore that. 
1056 /// 1057 /// \param input Input stream context pointer 1058 /// 1059 static void 1060 antlr3UTF16ConsumeLE(pANTLR3_INT_STREAM is) 1061 { 1062 pANTLR3_INPUT_STREAM input; 1063 UTF32 ch; 1064 UTF32 ch2; 1065 1066 input = ((pANTLR3_INPUT_STREAM) (is->super)); 1067 1068 // Buffer size is always in bytes 1069 // 1070 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1071 { 1072 // Indicate one more character in this line 1073 // 1074 input->charPositionInLine++; 1075 1076 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar) 1077 { 1078 // Reset for start of a new line of input 1079 // 1080 input->line++; 1081 input->charPositionInLine = 0; 1082 input->currentLine = (void *)(((pANTLR3_UINT16)input->nextChar) + 1); 1083 } 1084 1085 // Increment to next character position, accounting for any surrogates 1086 // 1087 // Next char in litle endian form 1088 // 1089 ch = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8); 1090 1091 // We consumed one 16 bit character 1092 // 1093 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1); 1094 1095 // If we have a surrogate pair then we need to consume 1096 // a following valid LO surrogate. 1097 // 1098 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 1099 1100 // If the 16 bits following the high surrogate are in the source buffer... 1101 // 1102 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1103 { 1104 ch2 = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8); 1105 1106 // If it's a valid low surrogate, consume it 1107 // 1108 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 1109 { 1110 // We consumed one 16 bit character 1111 // 1112 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1); 1113 } 1114 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1115 // it. 
1116 // 1117 } 1118 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1119 // it because the buffer ended 1120 // 1121 } 1122 // Note that we did not check for an invalid low surrogate here, or that fact that the 1123 // lo surrogate was missing. We just picked out one 16 bit character unless the character 1124 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters. 1125 // 1126 } 1127 } 1128 1129 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not 1130 /// 1131 /// \param[in] input Input stream context pointer 1132 /// \param[in] la 1 based offset of next input stream element 1133 /// 1134 /// \return Next input character in internal ANTLR3 encoding (UTF32) 1135 /// 1136 static ANTLR3_UCHAR 1137 antlr3UTF16LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la) 1138 { 1139 pANTLR3_INPUT_STREAM input; 1140 UTF32 ch; 1141 UTF32 ch2; 1142 pANTLR3_UCHAR nextChar; 1143 1144 // Find the input interface and where we are currently pointing to 1145 // in the input stream 1146 // 1147 input = ((pANTLR3_INPUT_STREAM) (is->super)); 1148 nextChar = input->nextChar; 1149 1150 // If a positive offset then advance forward, else retreat 1151 // 1152 if (la >= 0) 1153 { 1154 while (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf ) 1155 { 1156 // Advance our copy of the input pointer 1157 // 1158 // Next char in Little Endian byte order 1159 // 1160 ch = (*nextChar) + (*(nextChar+1) << 8); 1161 nextChar += 2; 1162 1163 // If we have a surrogate pair then we need to consume 1164 // a following valid LO surrogate. 1165 // 1166 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 1167 { 1168 // If the 16 bits following the high surrogate are in the source buffer... 
1169 // 1170 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1171 { 1172 // Next character is in little endian byte order 1173 // 1174 ch2 = (*nextChar) + (*(nextChar+1) << 8); 1175 1176 // If it's a valid low surrogate, consume it 1177 // 1178 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 1179 { 1180 // We consumed one 16 bit character 1181 // 1182 nextChar += 2; 1183 } 1184 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1185 // it. 1186 // 1187 } 1188 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1189 // it because the buffer ended 1190 // 1191 } 1192 // Note that we did not check for an invalid low surrogate here, or that fact that the 1193 // lo surrogate was missing. We just picked out one 16 bit character unless the character 1194 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters. 1195 // 1196 } 1197 } 1198 else 1199 { 1200 // We need to go backwards from our input point 1201 // 1202 while (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data ) 1203 { 1204 // Get the previous 16 bit character 1205 // 1206 ch = (*nextChar - 2) + ((*nextChar -1) << 8); 1207 nextChar -= 2; 1208 1209 // If we found a low surrogate then go back one more character if 1210 // the hi surrogate is there 1211 // 1212 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) 1213 { 1214 ch2 = (*nextChar - 2) + ((*nextChar -1) << 8); 1215 if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END) 1216 { 1217 // Yes, there is a high surrogate to match it so decrement one more and point to that 1218 // 1219 nextChar -=2; 1220 } 1221 } 1222 } 1223 } 1224 1225 // Our local copy of nextChar is now pointing to either the correct character or end of file 1226 // 1227 // Input buffer size is always in bytes 1228 // 1229 if ( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1230 { 1231 return ANTLR3_CHARSTREAM_EOF; 1232 } 1233 
else 1234 { 1235 // Pick up the next 16 character (little endian byte order) 1236 // 1237 ch = (*nextChar) + (*(nextChar+1) << 8); 1238 nextChar += 2; 1239 1240 // If we have a surrogate pair then we need to consume 1241 // a following valid LO surrogate. 1242 // 1243 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 1244 { 1245 // If the 16 bits following the high surrogate are in the source buffer... 1246 // 1247 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1248 { 1249 // Next character is in little endian byte order 1250 // 1251 ch2 = (*nextChar) + (*(nextChar+1) << 8); 1252 1253 // If it's a valid low surrogate, consume it 1254 // 1255 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 1256 { 1257 // Construct the UTF32 code point 1258 // 1259 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 1260 + (ch2 - UNI_SUR_LOW_START) + halfBase; 1261 } 1262 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1263 // it. 1264 // 1265 } 1266 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1267 // it because the buffer ended 1268 // 1269 } 1270 } 1271 return ch; 1272 } 1273 1274 /// \brief Consume the next character in a UTF16 input stream when the input is Big Endian and the machine is not 1275 /// 1276 /// \param input Input stream context pointer 1277 /// 1278 static void 1279 antlr3UTF16ConsumeBE(pANTLR3_INT_STREAM is) 1280 { 1281 pANTLR3_INPUT_STREAM input; 1282 UTF32 ch; 1283 UTF32 ch2; 1284 1285 input = ((pANTLR3_INPUT_STREAM) (is->super)); 1286 1287 // Buffer size is always in bytes 1288 // 1289 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1290 { 1291 // Indicate one more character in this line 1292 // 1293 input->charPositionInLine++; 1294 1295 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar) 1296 { 1297 // Reset for start of a new line of input 1298 // 1299 input->line++; 1300 
input->charPositionInLine = 0; 1301 input->currentLine = (void *)(((pANTLR3_UINT16)input->nextChar) + 1); 1302 } 1303 1304 // Increment to next character position, accounting for any surrogates 1305 // 1306 // Next char in big endian form 1307 // 1308 ch = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8); 1309 1310 // We consumed one 16 bit character 1311 // 1312 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1); 1313 1314 // If we have a surrogate pair then we need to consume 1315 // a following valid LO surrogate. 1316 // 1317 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 1318 1319 // If the 16 bits following the high surrogate are in the source buffer... 1320 // 1321 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1322 { 1323 // Big endian 1324 // 1325 ch2 = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8); 1326 1327 // If it's a valid low surrogate, consume it 1328 // 1329 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 1330 { 1331 // We consumed one 16 bit character 1332 // 1333 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1); 1334 } 1335 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1336 // it. 1337 // 1338 } 1339 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1340 // it because the buffer ended 1341 // 1342 } 1343 // Note that we did not check for an invalid low surrogate here, or that fact that the 1344 // lo surrogate was missing. We just picked out one 16 bit character unless the character 1345 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters. 
1346 // 1347 } 1348 } 1349 1350 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not 1351 /// 1352 /// \param[in] input Input stream context pointer 1353 /// \param[in] la 1 based offset of next input stream element 1354 /// 1355 /// \return Next input character in internal ANTLR3 encoding (UTF32) 1356 /// 1357 static ANTLR3_UCHAR 1358 antlr3UTF16LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la) 1359 { 1360 pANTLR3_INPUT_STREAM input; 1361 UTF32 ch; 1362 UTF32 ch2; 1363 pANTLR3_UCHAR nextChar; 1364 1365 // Find the input interface and where we are currently pointing to 1366 // in the input stream 1367 // 1368 input = ((pANTLR3_INPUT_STREAM) (is->super)); 1369 nextChar = input->nextChar; 1370 1371 // If a positive offset then advance forward, else retreat 1372 // 1373 if (la >= 0) 1374 { 1375 while (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf ) 1376 { 1377 // Advance our copy of the input pointer 1378 // 1379 // Next char in Big Endian byte order 1380 // 1381 ch = ((*nextChar) << 8) + *(nextChar+1); 1382 nextChar += 2; 1383 1384 // If we have a surrogate pair then we need to consume 1385 // a following valid LO surrogate. 1386 // 1387 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 1388 { 1389 // If the 16 bits following the high surrogate are in the source buffer... 1390 // 1391 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1392 { 1393 // Next character is in big endian byte order 1394 // 1395 ch2 = ((*nextChar) << 8) + *(nextChar+1); 1396 1397 // If it's a valid low surrogate, consume it 1398 // 1399 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 1400 { 1401 // We consumed one 16 bit character 1402 // 1403 nextChar += 2; 1404 } 1405 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1406 // it. 
1407 // 1408 } 1409 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1410 // it because the buffer ended 1411 // 1412 } 1413 // Note that we did not check for an invalid low surrogate here, or that fact that the 1414 // lo surrogate was missing. We just picked out one 16 bit character unless the character 1415 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters. 1416 // 1417 } 1418 } 1419 else 1420 { 1421 // We need to go backwards from our input point 1422 // 1423 while (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data ) 1424 { 1425 // Get the previous 16 bit character 1426 // 1427 ch = ((*nextChar - 2) << 8) + (*nextChar -1); 1428 nextChar -= 2; 1429 1430 // If we found a low surrogate then go back one more character if 1431 // the hi surrogate is there 1432 // 1433 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) 1434 { 1435 ch2 = ((*nextChar - 2) << 8) + (*nextChar -1); 1436 if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END) 1437 { 1438 // Yes, there is a high surrogate to match it so decrement one more and point to that 1439 // 1440 nextChar -=2; 1441 } 1442 } 1443 } 1444 } 1445 1446 // Our local copy of nextChar is now pointing to either the correct character or end of file 1447 // 1448 // Input buffer size is always in bytes 1449 // 1450 if ( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1451 { 1452 return ANTLR3_CHARSTREAM_EOF; 1453 } 1454 else 1455 { 1456 // Pick up the next 16 character (big endian byte order) 1457 // 1458 ch = ((*nextChar) << 8) + *(nextChar+1); 1459 nextChar += 2; 1460 1461 // If we have a surrogate pair then we need to consume 1462 // a following valid LO surrogate. 1463 // 1464 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 1465 { 1466 // If the 16 bits following the high surrogate are in the source buffer... 
1467 // 1468 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1469 { 1470 // Next character is in big endian byte order 1471 // 1472 ch2 = ((*nextChar) << 8) + *(nextChar+1); 1473 1474 // If it's a valid low surrogate, consume it 1475 // 1476 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 1477 { 1478 // Construct the UTF32 code point 1479 // 1480 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 1481 + (ch2 - UNI_SUR_LOW_START) + halfBase; 1482 } 1483 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1484 // it. 1485 // 1486 } 1487 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1488 // it because the buffer ended 1489 // 1490 } 1491 } 1492 return ch; 1493 } 1494 1495 /// \brief Common function to setup function interface for a UTF3 input stream. 1496 /// 1497 /// \param input Input stream context pointer 1498 /// 1499 void 1500 antlr3UTF32SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian) 1501 { 1502 // Build a string factory for this stream. This is a UTF32 string factory which is a standard 1503 // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser 1504 // and so on. 1505 // 1506 input->strFactory = antlr3StringFactoryNew(input->encoding); 1507 1508 // Generic API that does not care about endianess. 1509 // 1510 input->istream->index = antlr3UTF32Index; // Calculate current index in input stream, UTF16 based 1511 input->substr = antlr3UTF32Substr; // Return a string from the input stream 1512 input->istream->seek = antlr3UTF32Seek; // How to seek to a specific point in the stream 1513 input->istream->consume = antlr3UTF32Consume; // Consume the next UTF32 character in the buffer 1514 1515 // We must install different UTF32 LA routines according to whether the input 1516 // is the same endianess as the machine we are executing upon or not. 
If it is not 1517 // then we must install methods that can convert the endianess on the fly as they go 1518 // 1519 switch (machineBigEndian) 1520 { 1521 case ANTLR3_TRUE: 1522 1523 // Machine is Big Endian, if the input is also then install the 1524 // methods that do not access input by bytes and reverse them. 1525 // Otherwise install endian aware methods. 1526 // 1527 if (inputBigEndian == ANTLR3_TRUE) 1528 { 1529 // Input is machine compatible 1530 // 1531 input->istream->_LA = antlr3UTF32LA; // Return the UTF32 character at offset n (1 based) 1532 } 1533 else 1534 { 1535 // Need to use methods that know that the input is little endian 1536 // 1537 input->istream->_LA = antlr3UTF32LALE; // Return the UTF32 character at offset n (1 based) 1538 } 1539 break; 1540 1541 case ANTLR3_FALSE: 1542 1543 // Machine is Little Endian, if the input is also then install the 1544 // methods that do not access input by bytes and reverse them. 1545 // Otherwise install endian aware methods. 1546 // 1547 if (inputBigEndian == ANTLR3_FALSE) 1548 { 1549 // Input is machine compatible 1550 // 1551 input->istream->_LA = antlr3UTF32LA; // Return the UTF32 character at offset n (1 based) 1552 } 1553 else 1554 { 1555 // Need to use methods that know that the input is Big Endian 1556 // 1557 input->istream->_LA = antlr3UTF32LABE; // Return the UTF32 character at offset n (1 based) 1558 } 1559 break; 1560 } 1561 1562 input->charByteSize = 4; // Size in bytes of characters in this stream. 
1563 } 1564 1565 /** \brief Consume the next character in a UTF32 input stream 1566 * 1567 * \param input Input stream context pointer 1568 */ 1569 static void 1570 antlr3UTF32Consume(pANTLR3_INT_STREAM is) 1571 { 1572 pANTLR3_INPUT_STREAM input; 1573 1574 input = ((pANTLR3_INPUT_STREAM) (is->super)); 1575 1576 // SizeBuf is always in bytes 1577 // 1578 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1579 { 1580 /* Indicate one more character in this line 1581 */ 1582 input->charPositionInLine++; 1583 1584 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar)) == input->newlineChar) 1585 { 1586 /* Reset for start of a new line of input 1587 */ 1588 input->line++; 1589 input->charPositionInLine = 0; 1590 input->currentLine = (void *)(((pANTLR3_UINT32)input->nextChar) + 1); 1591 } 1592 1593 /* Increment to next character position 1594 */ 1595 input->nextChar = (void *)(((pANTLR3_UINT32)input->nextChar) + 1); 1596 } 1597 } 1598 1599 /// \brief Calculate the current index in the output stream. 1600 /// \param[in] input Input stream context pointer 1601 /// 1602 static ANTLR3_MARKER 1603 antlr3UTF32Index(pANTLR3_INT_STREAM is) 1604 { 1605 pANTLR3_INPUT_STREAM input; 1606 1607 input = ((pANTLR3_INPUT_STREAM) (is->super)); 1608 1609 return (ANTLR3_MARKER)(input->nextChar); 1610 } 1611 1612 /// \brief Return a substring of the UTF16 input stream in 1613 /// newly allocated memory. 1614 /// 1615 /// \param input Input stream context pointer 1616 /// \param start Offset in input stream where the string starts 1617 /// \param stop Offset in the input stream where the string ends. 1618 /// 1619 static pANTLR3_STRING 1620 antlr3UTF32Substr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop) 1621 { 1622 return input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/4) + 1); 1623 } 1624 1625 /// \brief Rewind the lexer input to the state specified by the supplied mark. 
1626 /// 1627 /// \param[in] input Input stream context pointer 1628 /// 1629 /// \remark 1630 /// Assumes UTF32 input stream. 1631 /// 1632 static void 1633 antlr3UTF32Seek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint) 1634 { 1635 pANTLR3_INPUT_STREAM input; 1636 1637 input = ((pANTLR3_INPUT_STREAM) is->super); 1638 1639 // If the requested seek point is less than the current 1640 // input point, then we assume that we are resetting from a mark 1641 // and do not need to scan, but can just set to there as rewind will 1642 // reset line numbers and so on. 1643 // 1644 if (seekPoint <= (ANTLR3_MARKER)(input->nextChar)) 1645 { 1646 input->nextChar = (void *)seekPoint; 1647 } 1648 else 1649 { 1650 // Call consume until we reach the asked for seek point or EOF 1651 // 1652 while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar) 1653 { 1654 is->consume(is); 1655 } 1656 } 1657 } 1658 1659 /** \brief Return the input element assuming a UTF32 input in natural machine byte order 1660 * 1661 * \param[in] input Input stream context pointer 1662 * \param[in] la 1 based offset of next input stream element 1663 * 1664 * \return Next input character in internal ANTLR3 encoding (UTF32) 1665 */ 1666 static ANTLR3_UCHAR 1667 antlr3UTF32LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la) 1668 { 1669 pANTLR3_INPUT_STREAM input; 1670 1671 input = ((pANTLR3_INPUT_STREAM) (is->super)); 1672 1673 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1674 { 1675 return ANTLR3_CHARSTREAM_EOF; 1676 } 1677 else 1678 { 1679 return (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1)); 1680 } 1681 } 1682 1683 /** \brief Return the input element assuming a UTF32 input in little endian byte order 1684 * 1685 * \param[in] input Input stream context pointer 1686 * \param[in] la 1 based offset of next input stream element 1687 * 1688 * \return Next input character in internal ANTLR3 encoding (UTF32) 1689 */ 1690 
static ANTLR3_UCHAR 1691 antlr3UTF32LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la) 1692 { 1693 pANTLR3_INPUT_STREAM input; 1694 1695 input = ((pANTLR3_INPUT_STREAM) (is->super)); 1696 1697 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1698 { 1699 return ANTLR3_CHARSTREAM_EOF; 1700 } 1701 else 1702 { 1703 ANTLR3_UCHAR c; 1704 1705 c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1)); 1706 1707 // Swap Endianess to Big Endian 1708 // 1709 return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24); 1710 } 1711 } 1712 1713 /** \brief Return the input element assuming a UTF32 input in big endian byte order 1714 * 1715 * \param[in] input Input stream context pointer 1716 * \param[in] la 1 based offset of next input stream element 1717 * 1718 * \return Next input character in internal ANTLR3 encoding (UTF32) 1719 * \remark This is the same code as LE version but seprated in case there are better optimisations fo rendinan swap 1720 */ 1721 static ANTLR3_UCHAR 1722 antlr3UTF32LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la) 1723 { 1724 pANTLR3_INPUT_STREAM input; 1725 1726 input = ((pANTLR3_INPUT_STREAM) (is->super)); 1727 1728 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1729 { 1730 return ANTLR3_CHARSTREAM_EOF; 1731 } 1732 else 1733 { 1734 ANTLR3_UCHAR c; 1735 1736 c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1)); 1737 1738 // Swap Endianess to Little Endian 1739 // 1740 return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24); 1741 } 1742 } 1743 1744 1745 /// \brief Common function to setup function interface for a UTF8 input stream. 1746 /// 1747 /// \param input Input stream context pointer 1748 /// 1749 void 1750 antlr3UTF8SetupStream (pANTLR3_INPUT_STREAM input) 1751 { 1752 // Build a string factory for this stream. This is a UTF16 string factory which is a standard 1753 // part of the ANTLR3 string. 
The string factory is then passed through the whole chain of lexer->parser->tree->treeparser 1754 // and so on. 1755 // 1756 input->strFactory = antlr3StringFactoryNew(input->encoding); 1757 1758 // Generic API that does not care about endianess. 1759 // 1760 input->istream->consume = antlr3UTF8Consume; // Consume the next UTF32 character in the buffer 1761 input->istream->_LA = antlr3UTF8LA; // Return the UTF32 character at offset n (1 based) 1762 input->charByteSize = 0; // Size in bytes of characters in this stream. 1763 } 1764 1765 // ------------------------------------------------------ 1766 // Following is from Unicode.org (see antlr3convertutf.c) 1767 // 1768 1769 /// Index into the table below with the first byte of a UTF-8 sequence to 1770 /// get the number of trailing bytes that are supposed to follow it. 1771 /// Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is 1772 /// left as-is for anyone who may want to do such conversion, which was 1773 /// allowed in earlier algorithms. 1774 /// 1775 static const ANTLR3_UINT32 trailingBytesForUTF8[256] = { 1776 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1777 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1778 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1779 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1780 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1781 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1782 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1783 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 1784 }; 1785 1786 /// Magic values subtracted from a buffer value during UTF8 conversion. 1787 /// This table contains as many values as there might be trailing bytes 1788 /// in a UTF-8 sequence. 
1789 /// 1790 static const UTF32 offsetsFromUTF8[6] = 1791 { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 1792 0x03C82080UL, 0xFA082080UL, 0x82082080UL 1793 }; 1794 1795 // End of Unicode.org tables 1796 // ------------------------- 1797 1798 1799 /** \brief Consume the next character in a UTF8 input stream 1800 * 1801 * \param input Input stream context pointer 1802 */ 1803 static void 1804 antlr3UTF8Consume(pANTLR3_INT_STREAM is) 1805 { 1806 pANTLR3_INPUT_STREAM input; 1807 ANTLR3_UINT32 extraBytesToRead; 1808 ANTLR3_UCHAR ch; 1809 pANTLR3_UINT8 nextChar; 1810 1811 input = ((pANTLR3_INPUT_STREAM) (is->super)); 1812 1813 nextChar = input->nextChar; 1814 1815 if (nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1816 { 1817 // Indicate one more character in this line 1818 // 1819 input->charPositionInLine++; 1820 1821 // Are there more bytes needed to make up the whole thing? 1822 // 1823 extraBytesToRead = trailingBytesForUTF8[*nextChar]; 1824 1825 if (nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1826 { 1827 input->nextChar = (((pANTLR3_UINT8)input->data) + input->sizeBuf); 1828 return; 1829 } 1830 1831 // Cases deliberately fall through (see note A in antlrconvertutf.c) 1832 // Legal UTF8 is only 4 bytes but 6 bytes could be used in old UTF8 so 1833 // we allow it. 
1834 // 1835 ch = 0; 1836 switch (extraBytesToRead) { 1837 case 5: ch += *nextChar++; ch <<= 6; 1838 case 4: ch += *nextChar++; ch <<= 6; 1839 case 3: ch += *nextChar++; ch <<= 6; 1840 case 2: ch += *nextChar++; ch <<= 6; 1841 case 1: ch += *nextChar++; ch <<= 6; 1842 case 0: ch += *nextChar++; 1843 } 1844 1845 // Magically correct the input value 1846 // 1847 ch -= offsetsFromUTF8[extraBytesToRead]; 1848 if (ch == input->newlineChar) 1849 { 1850 /* Reset for start of a new line of input 1851 */ 1852 input->line++; 1853 input->charPositionInLine = 0; 1854 input->currentLine = (void *)nextChar; 1855 } 1856 1857 // Update input pointer 1858 // 1859 input->nextChar = nextChar; 1860 } 1861 } 1862 /** \brief Return the input element assuming a UTF8 input 1863 * 1864 * \param[in] input Input stream context pointer 1865 * \param[in] la 1 based offset of next input stream element 1866 * 1867 * \return Next input character in internal ANTLR3 encoding (UTF32) 1868 */ 1869 static ANTLR3_UCHAR 1870 antlr3UTF8LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la) 1871 { 1872 pANTLR3_INPUT_STREAM input; 1873 ANTLR3_UINT32 extraBytesToRead; 1874 ANTLR3_UCHAR ch; 1875 pANTLR3_UINT8 nextChar; 1876 1877 input = ((pANTLR3_INPUT_STREAM) (is->super)); 1878 1879 nextChar = input->nextChar; 1880 1881 // Do we need to traverse forwards or backwards? 1882 // - LA(0) is treated as LA(1) and we assume that the nextChar is 1883 // already positioned. 1884 // - LA(n+) ; n>1 means we must traverse forward n-1 characters catering for UTF8 encoding 1885 // - LA(-n) means we must traverse backwards n chracters 1886 // 1887 if (la > 1) { 1888 1889 // Make sure that we have at least one character left before trying to 1890 // loop through the buffer. 1891 // 1892 if (nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1893 { 1894 // Now traverse n-1 characters forward 1895 // 1896 while (--la > 0) 1897 { 1898 // Does the next character require trailing bytes? 
1899 // If so advance the pointer by that many bytes as well as advancing 1900 // one position for what will be at least a single byte character. 1901 // 1902 nextChar += trailingBytesForUTF8[*nextChar] + 1; 1903 1904 // Does that calculation take us past the byte length of the buffer? 1905 // 1906 if (nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1907 { 1908 return ANTLR3_CHARSTREAM_EOF; 1909 } 1910 } 1911 } 1912 else 1913 { 1914 return ANTLR3_CHARSTREAM_EOF; 1915 } 1916 } 1917 else 1918 { 1919 // LA is negative so we decrease the pointer by n character positions 1920 // 1921 while (nextChar > (pANTLR3_UINT8)input->data && la++ < 0) 1922 { 1923 // Traversing backwards in UTF8 means decermenting by one 1924 // then continuing to decrement while ever a character pattern 1925 // is flagged as being a trailing byte of an encoded code point. 1926 // Trailing UTF8 bytes always start with 10 in binary. We assumne that 1927 // the UTF8 is well formed and do not check boundary conditions 1928 // 1929 nextChar--; 1930 while ((*nextChar & 0xC0) == 0x80) 1931 { 1932 nextChar--; 1933 } 1934 } 1935 } 1936 1937 // nextChar is now pointing at the UTF8 encoded character that we need to 1938 // decode and return. 1939 // 1940 // Are there more bytes needed to make up the whole thing? 
1941 // 1942 extraBytesToRead = trailingBytesForUTF8[*nextChar]; 1943 if (nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 1944 { 1945 return ANTLR3_CHARSTREAM_EOF; 1946 } 1947 1948 // Cases deliberately fall through (see note A in antlrconvertutf.c) 1949 // 1950 ch = 0; 1951 switch (extraBytesToRead) { 1952 case 5: ch += *nextChar++; ch <<= 6; 1953 case 4: ch += *nextChar++; ch <<= 6; 1954 case 3: ch += *nextChar++; ch <<= 6; 1955 case 2: ch += *nextChar++; ch <<= 6; 1956 case 1: ch += *nextChar++; ch <<= 6; 1957 case 0: ch += *nextChar++; 1958 } 1959 1960 // Magically correct the input value 1961 // 1962 ch -= offsetsFromUTF8[extraBytesToRead]; 1963 1964 return ch; 1965 } 1966 1967 // EBCDIC to ASCII conversion table 1968 // 1969 // This for EBCDIC EDF04 translated to ISO-8859.1 which is the usually accepted POSIX 1970 // translation and the character tables are published all over the interweb. 1971 // 1972 const ANTLR3_UCHAR e2a[256] = 1973 { 1974 0x00, 0x01, 0x02, 0x03, 0x85, 0x09, 0x86, 0x7f, 1975 0x87, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 1976 0x10, 0x11, 0x12, 0x13, 0x8f, 0x0a, 0x08, 0x97, 1977 0x18, 0x19, 0x9c, 0x9d, 0x1c, 0x1d, 0x1e, 0x1f, 1978 0x80, 0x81, 0x82, 0x83, 0x84, 0x92, 0x17, 0x1b, 1979 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07, 1980 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, 1981 0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a, 1982 0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5, 1983 0xe7, 0xf1, 0x60, 0x2e, 0x3c, 0x28, 0x2b, 0x7c, 1984 0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef, 1985 0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x9f, 1986 0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5, 1987 0xc7, 0xd1, 0x5e, 0x2c, 0x25, 0x5f, 0x3e, 0x3f, 1988 0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf, 1989 0xcc, 0xa8, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22, 1990 0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 1991 0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1, 1992 0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 
1993 0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4, 1994 0xb5, 0xaf, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 1995 0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xde, 0xae, 1996 0xa2, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc, 1997 0xbd, 0xbe, 0xac, 0x5b, 0x5c, 0x5d, 0xb4, 0xd7, 1998 0xf9, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 1999 0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5, 2000 0xa6, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 2001 0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xdb, 0xfa, 0xff, 2002 0xd9, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 2003 0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5, 2004 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 2005 0x38, 0x39, 0xb3, 0x7b, 0xdc, 0x7d, 0xda, 0x7e 2006 }; 2007 2008 /// \brief Common function to setup function interface for a EBCDIC input stream. 2009 /// 2010 /// \param input Input stream context pointer 2011 /// 2012 void 2013 antlr3EBCDICSetupStream (pANTLR3_INPUT_STREAM input) 2014 { 2015 // EBCDIC streams can use the standard 8 bit string factory 2016 // 2017 input->strFactory = antlr3StringFactoryNew(input->encoding); 2018 2019 // Generic API that does not care about endianess. 2020 // 2021 input->istream->_LA = antlr3EBCDICLA; // Return the UTF32 character at offset n (1 based) 2022 input->charByteSize = 1; // Size in bytes of characters in this stream. 
2023 } 2024 2025 /// \brief Return the input element assuming an 8 bit EBCDIC input 2026 /// 2027 /// \param[in] input Input stream context pointer 2028 /// \param[in] la 1 based offset of next input stream element 2029 /// 2030 /// \return Next input character in internal ANTLR3 encoding (UTF32) after translation 2031 /// from EBCDIC to ASCII 2032 /// 2033 static ANTLR3_UCHAR 2034 antlr3EBCDICLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la) 2035 { 2036 pANTLR3_INPUT_STREAM input; 2037 2038 input = ((pANTLR3_INPUT_STREAM) (is->super)); 2039 2040 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf)) 2041 { 2042 return ANTLR3_CHARSTREAM_EOF; 2043 } 2044 else 2045 { 2046 // Translate the required character via the constant conversion table 2047 // 2048 return e2a[(*((pANTLR3_UINT8)input->nextChar + la - 1))]; 2049 } 2050 }