1 ANTLR_BEGIN_NAMESPACE() 2 3 template<class ImplTraits, class SuperType> 4 ANTLR_INLINE IntStream<ImplTraits, SuperType>::IntStream() 5 { 6 m_lastMarker = 0; 7 m_upper_case = false; 8 } 9 10 template<class ImplTraits, class SuperType> 11 ANTLR_INLINE typename IntStream<ImplTraits, SuperType>::StringType IntStream<ImplTraits, SuperType>::getSourceName() 12 { 13 return m_streamName; 14 } 15 16 template<class ImplTraits, class SuperType> 17 ANTLR_INLINE typename IntStream<ImplTraits, SuperType>::StringType& IntStream<ImplTraits, SuperType>::get_streamName() 18 { 19 return m_streamName; 20 } 21 22 template<class ImplTraits, class SuperType> 23 ANTLR_INLINE const typename IntStream<ImplTraits, SuperType>::StringType& IntStream<ImplTraits, SuperType>::get_streamName() const 24 { 25 return m_streamName; 26 } 27 28 template<class ImplTraits, class SuperType> 29 ANTLR_INLINE ANTLR_MARKER IntStream<ImplTraits, SuperType>::get_lastMarker() const 30 { 31 return m_lastMarker; 32 } 33 34 template<class ImplTraits, class SuperType> 35 ANTLR_INLINE void IntStream<ImplTraits, SuperType>::setUcaseLA(bool flag) 36 { 37 m_upper_case = flag; 38 } 39 40 template<class ImplTraits, class SuperType> 41 ANTLR_INLINE SuperType* IntStream<ImplTraits, SuperType>::get_super() 42 { 43 return static_cast<SuperType*>(this); 44 } 45 46 template<class ImplTraits, class SuperType> 47 void IntStream<ImplTraits, SuperType>::consume() 48 { 49 SuperType* input = this->get_super(); 50 51 const ANTLR_UINT8* nextChar = input->get_nextChar(); 52 const ANTLR_UINT8* data = input->get_data(); 53 ANTLR_UINT32 sizeBuf = input->get_sizeBuf(); 54 55 if ( nextChar < ( data + sizeBuf ) ) 56 { 57 /* Indicate one more character in this line 58 */ 59 input->inc_charPositionInLine(); 60 61 if ((ANTLR_UCHAR)(*(nextChar)) == input->get_newlineChar() ) 62 { 63 /* Reset for start of a new line of input 64 */ 65 input->inc_line(); 66 input->set_charPositionInLine(0); 67 input->set_currentLine(nextChar + 1); 68 } 69 70 /* Increment to next character position 71 */ 72 input->set_nextChar( nextChar + 1 ); 73 } 74 } 75 76 template<class ImplTraits, class SuperType> 77 ANTLR_UINT32 IntStream<ImplTraits, SuperType>::_LA( ANTLR_INT32 la ) 78 { 79 SuperType* input = this->get_super(); 80 const ANTLR_UINT8* nextChar = input->get_nextChar(); 81 const ANTLR_UINT8* data = input->get_data(); 82 ANTLR_UINT32 sizeBuf = input->get_sizeBuf(); 83 84 if (( nextChar + la - 1) >= (data + sizeBuf)) 85 { 86 return ANTLR_CHARSTREAM_EOF; 87 } 88 else 89 { 90 if( !m_upper_case ) 91 return (ANTLR_UCHAR)(*(nextChar + la - 1)); 92 else 93 return (ANTLR_UCHAR)toupper(*(nextChar + la - 1)); 94 } 95 } 96 97 template<class ImplTraits, class SuperType> 98 ANTLR_MARKER IntStream<ImplTraits, SuperType>::mark() 99 { 100 LexState<ImplTraits>* state; 101 SuperType* input = this->get_super(); 102 103 /* New mark point 104 */ 105 input->inc_markDepth(); 106 107 /* See if we are revisiting a mark as we can just reuse the vector 108 * entry if we are, otherwise, we need a new one 109 */ 110 if (input->get_markDepth() > input->get_markers().size() ) 111 { 112 input->get_markers().push_back( LexState<ImplTraits>() ); 113 LexState<ImplTraits>& state_r = input->get_markers().back(); 114 state = &state_r; 115 } 116 else 117 { 118 LexState<ImplTraits>& state_r = input->get_markers().at( input->get_markDepth() - 1 ); 119 state = &state_r; 120 121 /* Assume no errors for speed, it will just blow up if the table failed 122 * for some reasons, hence lots of unit tests on the tables ;-) 123 */ 124 } 125 126 /* We have created or retrieved the state, so update it with the current 127 * elements of the lexer state. 128 */ 129 state->set_charPositionInLine( input->get_charPositionInLine() ); 130 state->set_currentLine( input->get_currentLine() ); 131 state->set_line( input->get_line() ); 132 state->set_nextChar( input->get_nextChar() ); 133 134 m_lastMarker = input->get_markDepth(); 135 136 /* And that's it 137 */ 138 return input->get_markDepth(); 139 } 140 141 template<class ImplTraits, class SuperType> 142 ANTLR_MARKER IntStream<ImplTraits, SuperType>::index() 143 { 144 SuperType* input = this->get_super(); 145 return input->index_impl(); 146 } 147 148 template<class ImplTraits, class SuperType> 149 void IntStream<ImplTraits, SuperType>::rewind(ANTLR_MARKER mark) 150 { 151 SuperType* input = this->get_super(); 152 153 /* Perform any clean up of the marks 154 */ 155 this->release(mark); 156 157 /* Find the supplied mark state 158 */ 159 ANTLR_UINT32 idx = static_cast<ANTLR_UINT32>( mark-1 ); 160 typename ImplTraits::LexStateType& state = input->get_markers().at( idx ); 161 162 /* Seek input pointer to the requested point (note we supply the void *pointer 163 * to whatever is implementing the int stream to seek). 164 */ 165 this->seek( (ANTLR_MARKER)state.get_nextChar() ); 166 167 /* Reset to the reset of the information in the mark 168 */ 169 input->set_charPositionInLine( state.get_charPositionInLine() ); 170 input->set_currentLine( state.get_currentLine() ); 171 input->set_line( state.get_line() ); 172 input->set_nextChar( state.get_nextChar() ); 173 174 /* And we are done 175 */ 176 } 177 178 template<class ImplTraits, class SuperType> 179 void IntStream<ImplTraits, SuperType>::rewindLast() 180 { 181 this->rewind(m_lastMarker); 182 } 183 184 template<class ImplTraits, class SuperType> 185 void IntStream<ImplTraits, SuperType>::release(ANTLR_MARKER mark) 186 { 187 SuperType* input = this->get_super(); 188 189 /* We don't do much here in fact as we never free any higher marks in 190 * the hashtable as we just resuse any memory allocated for them. 191 */ 192 input->set_markDepth( (ANTLR_UINT32)(mark - 1) ); 193 194 } 195 196 template<class ImplTraits, class SuperType> 197 void IntStream<ImplTraits, SuperType>::setupIntStream(bool, bool) 198 { 199 } 200 201 template<class ImplTraits, class SuperType> 202 void IntStream<ImplTraits, SuperType>::seek(ANTLR_MARKER seekPoint) 203 { 204 ANTLR_INT32 count; 205 SuperType* input = this->get_super(); 206 207 ANTLR_MARKER nextChar = (ANTLR_MARKER) input->get_nextChar(); 208 /* If the requested seek point is less than the current 209 * input point, then we assume that we are resetting from a mark 210 * and do not need to scan, but can just set to there. 211 */ 212 if (seekPoint <= nextChar) 213 { 214 input->set_nextChar((ANTLR_UINT8*) seekPoint); 215 } 216 else 217 { 218 count = (ANTLR_UINT32)(seekPoint - nextChar); 219 220 while (count--) 221 { 222 this->consume(); 223 } 224 } 225 } 226 227 template<class ImplTraits, class SuperType> 228 IntStream<ImplTraits, SuperType>::~IntStream() 229 { 230 } 231 232 template<class ImplTraits, class SuperType> 233 ANTLR_UINT32 EBCDIC_IntStream<ImplTraits, SuperType>::_LA( ANTLR_INT32 la) 234 { 235 // EBCDIC to ASCII conversion table 236 // 237 // This for EBCDIC EDF04 translated to ISO-8859.1 which is the usually accepted POSIX 238 // translation and the character tables are published all over the interweb. 239 // 240 const ANTLR_UCHAR e2a[256] = 241 { 242 0x00, 0x01, 0x02, 0x03, 0x85, 0x09, 0x86, 0x7f, 243 0x87, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 244 0x10, 0x11, 0x12, 0x13, 0x8f, 0x0a, 0x08, 0x97, 245 0x18, 0x19, 0x9c, 0x9d, 0x1c, 0x1d, 0x1e, 0x1f, 246 0x80, 0x81, 0x82, 0x83, 0x84, 0x92, 0x17, 0x1b, 247 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07, 248 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, 249 0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a, 250 0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5, 251 0xe7, 0xf1, 0x60, 0x2e, 0x3c, 0x28, 0x2b, 0x7c, 252 0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef, 253 0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x9f, 254 0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5, 255 0xc7, 0xd1, 0x5e, 0x2c, 0x25, 0x5f, 0x3e, 0x3f, 256 0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf, 257 0xcc, 0xa8, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22, 258 0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 259 0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1, 260 0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 261 0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4, 262 0xb5, 0xaf, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 263 0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xde, 0xae, 264 0xa2, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc, 265 0xbd, 0xbe, 0xac, 0x5b, 0x5c, 0x5d, 0xb4, 0xd7, 266 0xf9, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 267 0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5, 268 0xa6, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 269 0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xdb, 0xfa, 0xff, 270 0xd9, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 271 0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5, 272 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 273 0x38, 0x39, 0xb3, 0x7b, 0xdc, 0x7d, 0xda, 0x7e 274 }; 275 276 SuperType* input = this->get_super(); 277 278 if (( input->get_nextChar() + la - 1) >= ( input->get_data() + input->get_sizeBuf() )) 279 { 280 return ANTLR_CHARSTREAM_EOF; 281 } 282 else 283 { 284 // Translate the required character via the constant conversion table 285 // 286 return e2a[(*(input->get_nextChar() + la - 1))]; 287 } 288 } 289 290 template<class ImplTraits, class SuperType> 291 void EBCDIC_IntStream<ImplTraits, SuperType>::setupIntStream() 292 { 293 SuperType* super = this->get_super(); 294 super->set_charByteSize(1); 295 } 296 297 template<class ImplTraits, class SuperType> 298 ANTLR_UINT32 UTF16_IntStream<ImplTraits, SuperType>::_LA( ANTLR_INT32 i) 299 { 300 return this->_LA(i, ClassForwarder< typename ImplTraits::Endianness >() ); 301 } 302 303 template<class ImplTraits, class SuperType> 304 void UTF16_IntStream<ImplTraits, SuperType>::consume() 305 { 306 this->consume( ClassForwarder< typename ImplTraits::Endianness >() ); 307 } 308 309 template<class ImplTraits, class SuperType> 310 ANTLR_MARKER UTF16_IntStream<ImplTraits, SuperType>::index() 311 { 312 SuperType* input = this->get_super(); 313 return (ANTLR_MARKER)(input->get_nextChar()); 314 } 315 316 template<class ImplTraits, class SuperType> 317 void UTF16_IntStream<ImplTraits, SuperType>::seek(ANTLR_MARKER seekPoint) 318 { 319 SuperType* input = this->get_super(); 320 321 // If the requested seek point is less than the current 322 // input point, then we assume that we are resetting from a mark 323 // and do not need to scan, but can just set to there as rewind will 324 // reset line numbers and so on. 325 // 326 if (seekPoint <= (ANTLR_MARKER)(input->get_nextChar())) 327 { 328 input->set_nextChar( seekPoint ); 329 } 330 else 331 { 332 // Call consume until we reach the asked for seek point or EOF 333 // 334 while( (this->_LA(1) != ANTLR_CHARSTREAM_EOF) && (seekPoint < (ANTLR_MARKER)input->get_nextChar() ) ) 335 { 336 this->consume(); 337 } 338 } 339 } 340 341 template<class ImplTraits, class SuperType> 342 void IntStream<ImplTraits, SuperType>::findout_endian_spec(bool machineBigEndian, bool inputBigEndian) 343 { 344 // We must install different UTF16 routines according to whether the input 345 // is the same endianess as the machine we are executing upon or not. If it is not 346 // then we must install methods that can convert the endianess on the fly as they go 347 // 348 349 if(machineBigEndian == true) 350 { 351 // Machine is Big Endian, if the input is also then install the 352 // methods that do not access input by bytes and reverse them. 353 // Otherwise install endian aware methods. 354 // 355 if (inputBigEndian == true) 356 { 357 // Input is machine compatible 358 // 359 m_endian_spec = 1; 360 } 361 else 362 { 363 // Need to use methods that know that the input is little endian 364 // 365 m_endian_spec = 2; 366 } 367 } 368 else 369 { 370 // Machine is Little Endian, if the input is also then install the 371 // methods that do not access input by bytes and reverse them. 372 // Otherwise install endian aware methods. 373 // 374 if (inputBigEndian == false) 375 { 376 // Input is machine compatible 377 // 378 m_endian_spec = 1; 379 } 380 else 381 { 382 // Need to use methods that know that the input is Big Endian 383 // 384 m_endian_spec = 3; 385 } 386 } 387 } 388 389 template<class ImplTraits, class SuperType> 390 void UTF16_IntStream<ImplTraits, SuperType>::setupIntStream(bool machineBigEndian, bool inputBigEndian) 391 { 392 SuperType* super = this->get_super(); 393 super->set_charByteSize(2); 394 395 this->findout_endian_spec( machineBigEndian, inputBigEndian ); 396 } 397 398 template<class ImplTraits, class SuperType> 399 ANTLR_UINT32 IntStream<ImplTraits, SuperType>::_LA( ANTLR_INT32 i, ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> ) 400 { 401 assert( (m_endian_spec >= 1) && (m_endian_spec <= 3)); 402 switch(m_endian_spec) 403 { 404 case 1: 405 return this->_LA(i, ClassForwarder<BYTE_AGNOSTIC>() ); 406 break; 407 case 2: 408 return this->_LA(i, ClassForwarder<ANTLR_LITTLE_ENDIAN>() ); 409 break; 410 case 3: 411 return this->_LA(i, ClassForwarder<ANTLR_BIG_ENDIAN>() ); 412 break; 413 default: 414 break; 415 } 416 return 0; 417 } 418 419 template<class ImplTraits, class SuperType> 420 void IntStream<ImplTraits, SuperType>::consume( ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> ) 421 { 422 assert( (m_endian_spec >= 1) && (m_endian_spec <= 3)); 423 switch(m_endian_spec) 424 { 425 case 1: 426 this->consume( ClassForwarder<BYTE_AGNOSTIC>() ); 427 break; 428 case 2: 429 this->consume( ClassForwarder<ANTLR_LITTLE_ENDIAN>() ); 430 break; 431 case 3: 432 this->consume( ClassForwarder<ANTLR_BIG_ENDIAN>() ); 433 break; 434 default: 435 break; 436 } 437 } 438 439 template<class ImplTraits, class SuperType> 440 ANTLR_UINT32 UTF16_IntStream<ImplTraits, SuperType>::_LA( ANTLR_INT32 la, ClassForwarder<BYTE_AGNOSTIC> ) 441 { 442 SuperType* input; 443 UTF32 ch; 444 UTF32 ch2; 445 UTF16* nextChar; 446 447 // Find the input interface and where we are currently pointing to 448 // in the input stream 449 // 450 input = this->get_super; 451 nextChar = input->get_nextChar(); 452 453 // If a positive offset then advance forward, else retreat 454 // 455 if (la >= 0) 456 { 457 while (--la > 0 && (ANTLR_UINT8*)nextChar < ((ANTLR_UINT8*)input->get_data()) + input->get_sizeBuf() ) 458 { 459 // Advance our copy of the input pointer 460 // 461 // Next char in natural machine byte order 462 // 463 ch = *nextChar++; 464 465 // If we have a surrogate pair then we need to consume 466 // a following valid LO surrogate. 467 // 468 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 469 { 470 // If the 16 bits following the high surrogate are in the source buffer... 471 // 472 if ((ANTLR_UINT8*)(nextChar) < (((ANTLR_UINT8*)input->get_data()) + input->get_sizeBuf() )) 473 { 474 // Next character is in natural machine byte order 475 // 476 ch2 = *nextChar; 477 478 // If it's a valid low surrogate, consume it 479 // 480 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 481 { 482 // We consumed one 16 bit character 483 // 484 nextChar++; 485 } 486 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 487 // it. 488 // 489 } 490 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 491 // it because the buffer ended 492 // 493 } 494 // Note that we did not check for an invalid low surrogate here, or that fact that the 495 // lo surrogate was missing. We just picked out one 16 bit character unless the character 496 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters. 497 // 498 } 499 } 500 else 501 { 502 // We need to go backwards from our input point 503 // 504 while (la++ < 0 && (ANTLR_UINT8*)nextChar > (ANTLR_UINT8*)input->get_data() ) 505 { 506 // Get the previous 16 bit character 507 // 508 ch = *--nextChar; 509 510 // If we found a low surrogate then go back one more character if 511 // the hi surrogate is there 512 // 513 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) 514 { 515 ch2 = *(nextChar-1); 516 if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END) 517 { 518 // Yes, there is a high surrogate to match it so decrement one more and point to that 519 // 520 nextChar--; 521 } 522 } 523 } 524 } 525 526 // Our local copy of nextChar is now pointing to either the correct character or end of file 527 // 528 // Input buffer size is always in bytes 529 // 530 if ( (ANTLR_UINT8*)nextChar >= (((ANTLR_UINT8*)input->get_data()) + input->get_sizeBuf() )) 531 { 532 return ANTLR_CHARSTREAM_EOF; 533 } 534 else 535 { 536 // Pick up the next 16 character (native machine byte order) 537 // 538 ch = *nextChar++; 539 540 // If we have a surrogate pair then we need to consume 541 // a following valid LO surrogate. 542 // 543 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 544 { 545 // If the 16 bits following the high surrogate are in the source buffer... 546 // 547 if ((ANTLR_UINT8*)(nextChar) < (((ANTLR_UINT8*)input->get_data()) + input->get_sizeBuf())) 548 { 549 // Next character is in natural machine byte order 550 // 551 ch2 = *nextChar; 552 553 // If it's a valid low surrogate, consume it 554 // 555 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 556 { 557 // Construct the UTF32 code point 558 // 559 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 560 + (ch2 - UNI_SUR_LOW_START) + halfBase; 561 } 562 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 563 // it. 564 // 565 } 566 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 567 // it because the buffer ended 568 // 569 } 570 } 571 return ch; 572 } 573 574 template<class ImplTraits, class SuperType> 575 ANTLR_UINT32 UTF16_IntStream<ImplTraits, SuperType>::_LA( ANTLR_INT32 la, ClassForwarder<ANTLR_LITTLE_ENDIAN> ) 576 { 577 SuperType* input; 578 UTF32 ch; 579 UTF32 ch2; 580 ANTLR_UCHAR* nextChar; 581 582 // Find the input interface and where we are currently pointing to 583 // in the input stream 584 // 585 input = this->get_super(); 586 nextChar = input->get_nextChar(); 587 588 // If a positive offset then advance forward, else retreat 589 // 590 if (la >= 0) 591 { 592 while (--la > 0 && (ANTLR_UINT8*)nextChar < ((ANTLR_UINT8*)input->get_data()) + input->get_sizeBuf() ) 593 { 594 // Advance our copy of the input pointer 595 // 596 // Next char in Little Endian byte order 597 // 598 ch = (*nextChar) + (*(nextChar+1) << 8); 599 nextChar += 2; 600 601 // If we have a surrogate pair then we need to consume 602 // a following valid LO surrogate. 603 // 604 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 605 { 606 // If the 16 bits following the high surrogate are in the source buffer... 607 // 608 if ((ANTLR_UINT8*)(nextChar) < (((ANTLR_UINT8*)input->get_data()) + input->get_sizeBuf() )) 609 { 610 // Next character is in little endian byte order 611 // 612 ch2 = (*nextChar) + (*(nextChar+1) << 8); 613 614 // If it's a valid low surrogate, consume it 615 // 616 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 617 { 618 // We consumed one 16 bit character 619 // 620 nextChar += 2; 621 } 622 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 623 // it. 624 // 625 } 626 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 627 // it because the buffer ended 628 // 629 } 630 // Note that we did not check for an invalid low surrogate here, or that fact that the 631 // lo surrogate was missing. We just picked out one 16 bit character unless the character 632 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters. 633 // 634 } 635 } 636 else 637 { 638 // We need to go backwards from our input point 639 // 640 while (la++ < 0 && (ANTLR_UINT8*)nextChar > (ANTLR_UINT8*)input->get_data() ) 641 { 642 // Get the previous 16 bit character 643 // 644 ch = (*nextChar - 2) + ((*nextChar -1) << 8); 645 nextChar -= 2; 646 647 // If we found a low surrogate then go back one more character if 648 // the hi surrogate is there 649 // 650 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) 651 { 652 ch2 = (*nextChar - 2) + ((*nextChar -1) << 8); 653 if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END) 654 { 655 // Yes, there is a high surrogate to match it so decrement one more and point to that 656 // 657 nextChar -=2; 658 } 659 } 660 } 661 } 662 663 // Our local copy of nextChar is now pointing to either the correct character or end of file 664 // 665 // Input buffer size is always in bytes 666 // 667 if ( (ANTLR_UINT8*)nextChar >= (((ANTLR_UINT8*)input->get_data()) + input->get_sizeBuf())) 668 { 669 return ANTLR_CHARSTREAM_EOF; 670 } 671 else 672 { 673 // Pick up the next 16 character (little endian byte order) 674 // 675 ch = (*nextChar) + (*(nextChar+1) << 8); 676 nextChar += 2; 677 678 // If we have a surrogate pair then we need to consume 679 // a following valid LO surrogate. 680 // 681 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 682 { 683 // If the 16 bits following the high surrogate are in the source buffer... 684 // 685 if ((ANTLR_UINT8*)(nextChar) < (((ANTLR_UINT8*)input->get_data()) + input->get_sizeBuf())) 686 { 687 // Next character is in little endian byte order 688 // 689 ch2 = (*nextChar) + (*(nextChar+1) << 8); 690 691 // If it's a valid low surrogate, consume it 692 // 693 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 694 { 695 // Construct the UTF32 code point 696 // 697 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 698 + (ch2 - UNI_SUR_LOW_START) + halfBase; 699 } 700 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 701 // it. 702 // 703 } 704 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 705 // it because the buffer ended 706 // 707 } 708 } 709 return ch; 710 } 711 712 template<class ImplTraits, class SuperType> 713 ANTLR_UINT32 UTF16_IntStream<ImplTraits, SuperType>::_LA( ANTLR_INT32 la, ClassForwarder<ANTLR_BIG_ENDIAN> ) 714 { 715 SuperType* input; 716 UTF32 ch; 717 UTF32 ch2; 718 ANTLR_UCHAR* nextChar; 719 720 // Find the input interface and where we are currently pointing to 721 // in the input stream 722 // 723 input = this->get_super(); 724 nextChar = input->get_nextChar(); 725 726 // If a positive offset then advance forward, else retreat 727 // 728 if (la >= 0) 729 { 730 while (--la > 0 && (ANTLR_UINT8*)nextChar < ((ANTLR_UINT8*)input->get_data()) + input->get_sizeBuf() ) 731 { 732 // Advance our copy of the input pointer 733 // 734 // Next char in Big Endian byte order 735 // 736 ch = ((*nextChar) << 8) + *(nextChar+1); 737 nextChar += 2; 738 739 // If we have a surrogate pair then we need to consume 740 // a following valid LO surrogate. 741 // 742 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 743 { 744 // If the 16 bits following the high surrogate are in the source buffer... 745 // 746 if ((ANTLR_UINT8*)(nextChar) < (((ANTLR_UINT8*)input->get_data()) + input->get_sizeBuf())) 747 { 748 // Next character is in big endian byte order 749 // 750 ch2 = ((*nextChar) << 8) + *(nextChar+1); 751 752 // If it's a valid low surrogate, consume it 753 // 754 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 755 { 756 // We consumed one 16 bit character 757 // 758 nextChar += 2; 759 } 760 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 761 // it. 762 // 763 } 764 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 765 // it because the buffer ended 766 // 767 } 768 // Note that we did not check for an invalid low surrogate here, or that fact that the 769 // lo surrogate was missing. We just picked out one 16 bit character unless the character 770 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters. 771 // 772 } 773 } 774 else 775 { 776 // We need to go backwards from our input point 777 // 778 while (la++ < 0 && (ANTLR_UINT8*)nextChar > (ANTLR_UINT8*)input->get_data() ) 779 { 780 // Get the previous 16 bit character 781 // 782 ch = ((*nextChar - 2) << 8) + (*nextChar -1); 783 nextChar -= 2; 784 785 // If we found a low surrogate then go back one more character if 786 // the hi surrogate is there 787 // 788 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) 789 { 790 ch2 = ((*nextChar - 2) << 8) + (*nextChar -1); 791 if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END) 792 { 793 // Yes, there is a high surrogate to match it so decrement one more and point to that 794 // 795 nextChar -=2; 796 } 797 } 798 } 799 } 800 801 // Our local copy of nextChar is now pointing to either the correct character or end of file 802 // 803 // Input buffer size is always in bytes 804 // 805 if ( (ANTLR_UINT8*)nextChar >= (((ANTLR_UINT8*)input->get_data()) + input->get_sizeBuf())) 806 { 807 return ANTLR_CHARSTREAM_EOF; 808 } 809 else 810 { 811 // Pick up the next 16 character (big endian byte order) 812 // 813 ch = ((*nextChar) << 8) + *(nextChar+1); 814 nextChar += 2; 815 816 // If we have a surrogate pair then we need to consume 817 // a following valid LO surrogate. 818 // 819 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 820 { 821 // If the 16 bits following the high surrogate are in the source buffer... 822 // 823 if ((ANTLR_UINT8*)(nextChar) < (((ANTLR_UINT8*)input->get_data()) + input->get_sizeBuf())) 824 { 825 // Next character is in big endian byte order 826 // 827 ch2 = ((*nextChar) << 8) + *(nextChar+1); 828 829 // If it's a valid low surrogate, consume it 830 // 831 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 832 { 833 // Construct the UTF32 code point 834 // 835 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 836 + (ch2 - UNI_SUR_LOW_START) + halfBase; 837 } 838 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 839 // it. 840 // 841 } 842 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 843 // it because the buffer ended 844 // 845 } 846 } 847 return ch; 848 } 849 850 template<class ImplTraits, class SuperType> 851 void UTF16_IntStream<ImplTraits, SuperType>::consume( ClassForwarder<BYTE_AGNOSTIC> ) 852 { 853 SuperType* input; 854 UTF32 ch; 855 UTF32 ch2; 856 857 input = this->get_super(); 858 859 // Buffer size is always in bytes 860 // 861 if(input->get_nextChar() < (input->get_data() + input->get_sizeBuf()/2) ) 862 { 863 // Indicate one more character in this line 864 // 865 input->inc_charPositionInLine(); 866 867 if ((ANTLR_UCHAR)(*(input->get_nextChar())) == input->get_newlineChar()) 868 { 869 // Reset for start of a new line of input 870 // 871 input->inc_line(); 872 input->set_charPositionInLine(0); 873 input->set_currentLine( input->get_nextChar() + 1 ); 874 } 875 876 // Increment to next character position, accounting for any surrogates 877 // 878 // Next char in natural machine byte order 879 // 880 ch = *(input->get_nextChar()); 881 882 // We consumed one 16 bit character 883 // 884 input->set_nextChar( input->get_nextChar() + 1 ); 885 886 // If we have a surrogate pair then we need to consume 887 // a following valid LO surrogate. 888 // 889 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 890 891 // If the 16 bits following the high surrogate are in the source buffer... 892 // 893 if(input->get_nextChar() < (input->get_data() + input->get_sizeBuf()/2) ) 894 { 895 // Next character is in natural machine byte order 896 // 897 ch2 = *(input->get_nextChar()); 898 899 // If it's a valid low surrogate, consume it 900 // 901 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 902 { 903 // We consumed one 16 bit character 904 // 905 input->set_nextChar( input->get_nextChar() + 1 ); 906 } 907 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 908 // it. 909 // 910 } 911 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 912 // it because the buffer ended 913 // 914 } 915 // Note that we did not check for an invalid low surrogate here, or that fact that the 916 // lo surrogate was missing. We just picked out one 16 bit character unless the character 917 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters. 918 // 919 } 920 921 } 922 923 template<class ImplTraits, class SuperType> 924 void UTF16_IntStream<ImplTraits, SuperType>::consume( ClassForwarder<ANTLR_LITTLE_ENDIAN> ) 925 { 926 SuperType* input; 927 UTF32 ch; 928 UTF32 ch2; 929 930 input = this->get_super(); 931 932 // Buffer size is always in bytes 933 // 934 if(input->get_nextChar() < (input->get_data() + input->get_sizeBuf()/2) ) 935 { 936 // Indicate one more character in this line 937 // 938 input->inc_charPositionInLine(); 939 940 if ((ANTLR_UCHAR)(*(input->get_nextChar())) == input->get_newlineChar()) 941 { 942 // Reset for start of a new line of input 943 // 944 input->inc_line(); 945 input->set_charPositionInLine(0); 946 input->set_currentLine(input->get_nextChar() + 1); 947 } 948 949 // Increment to next character position, accounting for any surrogates 950 // 951 // Next char in litle endian form 952 // 953 ch = *((ANTLR_UINT8*)input->get_nextChar()) + (*((ANTLR_UINT8*)input->get_nextChar() + 1) <<8); 954 955 // We consumed one 16 bit character 956 // 957 input->set_nextChar( input->get_nextChar() + 1); 958 959 // If we have a surrogate pair then we need to consume 960 // a following valid LO surrogate. 961 // 962 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 963 { 964 // If the 16 bits following the high surrogate are in the source buffer... 965 // 966 if(input->get_nextChar() < (input->get_data() + input->get_sizeBuf()/2) ) 967 { 968 ch2 = *((ANTLR_UINT8*)input->get_nextChar()) + (*((ANTLR_UINT8*)input->get_nextChar() + 1) <<8); 969 970 // If it's a valid low surrogate, consume it 971 // 972 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 973 { 974 // We consumed one 16 bit character 975 // 976 input->set_nextChar( input->get_nextChar() + 1); 977 } 978 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 979 // it. 980 // 981 } 982 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 983 // it because the buffer ended 984 // 985 } 986 // Note that we did not check for an invalid low surrogate here, or that fact that the 987 // lo surrogate was missing. We just picked out one 16 bit character unless the character 988 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters. 989 // 990 } 991 } 992 993 template<class ImplTraits, class SuperType> 994 void UTF16_IntStream<ImplTraits, SuperType>::consume( ClassForwarder<ANTLR_BIG_ENDIAN> ) 995 { 996 SuperType* input; 997 UTF32 ch; 998 UTF32 ch2; 999 1000 input = this->get_super(); 1001 1002 // Buffer size is always in bytes 1003 // 1004 if(input->get_nextChar() < (input->get_data() + input->get_sizeBuf()/2) ) 1005 { 1006 // Indicate one more character in this line 1007 // 1008 input->inc_charPositionInLine(); 1009 1010 if ((ANTLR_UCHAR)(*(input->get_nextChar())) == input->get_newlineChar()) 1011 { 1012 // Reset for start of a new line of input 1013 // 1014 input->inc_line(); 1015 input->set_charPositionInLine(0); 1016 input->set_currentLine(input->get_nextChar() + 1); 1017 } 1018 1019 // Increment to next character position, accounting for any surrogates 1020 // 1021 // Next char in big endian form 1022 // 1023 ch = *((ANTLR_UINT8*)input->get_nextChar() + 1) + (*((ANTLR_UINT8*)input->get_nextChar() ) <<8); 1024 1025 // We consumed one 16 bit character 1026 // 1027 input->set_nextChar( input->get_nextChar() + 1); 1028 1029 // If we have a surrogate pair then we need to consume 1030 // a following valid LO surrogate. 1031 // 1032 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) 1033 { 1034 // If the 16 bits following the high surrogate are in the source buffer... 1035 // 1036 if(input->get_nextChar() < (input->get_data() + input->get_sizeBuf()/2) ) 1037 { 1038 // Big endian 1039 // 1040 ch2 = *((ANTLR_UINT8*)input->get_nextChar() + 1) + (*((ANTLR_UINT8*)input->get_nextChar() ) <<8); 1041 1042 // If it's a valid low surrogate, consume it 1043 // 1044 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) 1045 { 1046 // We consumed one 16 bit character 1047 // 1048 input->set_nextChar( input->get_nextChar() + 1); 1049 } 1050 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1051 // it. 1052 // 1053 } 1054 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with 1055 // it because the buffer ended 1056 // 1057 } 1058 // Note that we did not check for an invalid low surrogate here, or that fact that the 1059 // lo surrogate was missing. We just picked out one 16 bit character unless the character 1060 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters. 1061 // 1062 } 1063 } 1064 1065 template<class ImplTraits, class SuperType> 1066 ANTLR_UINT32 UTF32_IntStream<ImplTraits, SuperType>::_LA( ANTLR_INT32 i) 1067 { 1068 return this->_LA( i, ClassForwarder<typename ImplTraits::Endianness>() ); 1069 } 1070 1071 template<class ImplTraits, class SuperType> 1072 ANTLR_MARKER UTF32_IntStream<ImplTraits, SuperType>::index() 1073 { 1074 SuperType* input = this->get_super(); 1075 return (ANTLR_MARKER)(input->get_nextChar()); 1076 } 1077 1078 template<class ImplTraits, class SuperType> 1079 void UTF32_IntStream<ImplTraits, SuperType>::seek(ANTLR_MARKER seekPoint) 1080 { 1081 SuperType* input; 1082 1083 input = this->get_super(); 1084 1085 // If the requested seek point is less than the current 1086 // input point, then we assume that we are resetting from a mark 1087 // and do not need to scan, but can just set to there as rewind will 1088 // reset line numbers and so on. 1089 // 1090 if (seekPoint <= (ANTLR_MARKER)(input->get_nextChar())) 1091 { 1092 input->set_nextChar( static_cast<typename ImplTraits::DataType*>(seekPoint) ); 1093 } 1094 else 1095 { 1096 // Call consume until we reach the asked for seek point or EOF 1097 // 1098 while( (this->_LA(1) != ANTLR_CHARSTREAM_EOF) && (seekPoint < (ANTLR_MARKER)input->get_nextChar()) ) 1099 { 1100 this->consume(); 1101 } 1102 } 1103 1104 } 1105 1106 template<class ImplTraits, class SuperType> 1107 void UTF32_IntStream<ImplTraits, SuperType>::setupIntStream(bool machineBigEndian, bool inputBigEndian) 1108 { 1109 SuperType* super = this->get_super(); 1110 super->set_charByteSize(4); 1111 1112 this->findout_endian_spec(machineBigEndian, inputBigEndian); 1113 } 1114 1115 template<class ImplTraits, class SuperType> 1116 ANTLR_UINT32 UTF32_IntStream<ImplTraits, SuperType>::_LA( ANTLR_INT32 la, ClassForwarder<BYTE_AGNOSTIC> ) 1117 { 1118 SuperType* input = this->get_super(); 1119 1120 if (( input->get_nextChar() + la - 1) >= (input->get_data() + input->get_sizeBuf()/4 )) 1121 { 1122 return ANTLR_CHARSTREAM_EOF; 1123 } 1124 else 1125 { 1126 return (ANTLR_UCHAR)(*(input->get_nextChar() + la - 1)); 1127 } 1128 } 1129 1130 template<class ImplTraits, class SuperType> 1131 ANTLR_UINT32 UTF32_IntStream<ImplTraits, SuperType>::_LA( ANTLR_INT32 la, ClassForwarder<ANTLR_LITTLE_ENDIAN> ) 1132 { 1133 SuperType* input = this->get_super(); 1134 1135 if (( input->get_nextChar() + la - 1) >= (input->get_data() + input->get_sizeBuf()/4 )) 1136 { 1137 return ANTLR_CHARSTREAM_EOF; 1138 } 1139 else 1140 { 1141 ANTLR_UCHAR c; 1142 1143 c = (ANTLR_UCHAR)(*(input->get_nextChar() + la - 1)); 1144 1145 // Swap Endianess to Big Endian 1146 // 1147 return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24); 1148 } 1149 } 1150 1151 template<class ImplTraits, class SuperType> 1152 ANTLR_UINT32 UTF32_IntStream<ImplTraits, SuperType>::_LA( ANTLR_INT32 la, ClassForwarder<ANTLR_BIG_ENDIAN> ) 1153 { 1154 SuperType* input = this->get_super(); 1155 1156 if (( input->get_nextChar() + la - 1) >= (input->get_data() + input->get_sizeBuf()/4 )) 1157 { 1158 return ANTLR_CHARSTREAM_EOF; 1159 } 1160 else 1161 { 1162 ANTLR_UCHAR c; 1163 1164 c = (ANTLR_UCHAR)(*(input->get_nextChar() + la - 1)); 1165 1166 // Swap Endianess to Little Endian 1167 // 1168 return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24); 1169 } 1170 } 1171 1172 template<class ImplTraits, class SuperType> 1173 void UTF32_IntStream<ImplTraits, SuperType>::consume() 1174 { 1175 SuperType* input = this->get_super(); 1176 1177 // SizeBuf is always in bytes 1178 // 1179 if ( input->get_nextChar() < (input->get_data() + input->get_sizeBuf()/4 )) 1180 { 1181 /* Indicate one more character in this line 1182 */ 1183 input->inc_charPositionInLine(); 1184 1185 if ((ANTLR_UCHAR)(*(input->get_nextChar())) == input->get_newlineChar()) 1186 { 1187 /* Reset for start of a new line of input 1188 */ 1189 input->inc_line(); 1190 input->set_charPositionInLine(0); 1191 input->set_currentLine( input->get_nextChar() + 1 ); 1192 } 1193 1194 /* Increment to next character position 1195 */ 1196 input->set_nextChar( input->get_nextChar() + 1 ); 1197 } 1198 } 1199 1200 template<class ImplTraits, class SuperType> 1201 void UTF8_IntStream<ImplTraits, SuperType>::setupIntStream(bool, bool) 1202 { 1203 SuperType* super = this->get_super(); 1204 super->set_charByteSize(0); 1205 } 1206 1207 // ------------------------------------------------------ 1208 // Following is from Unicode.org (see antlr3convertutf.c) 1209 // 1210 1211 /// Index into the table below with the first byte of a UTF-8 sequence to 1212 /// get the number of trailing bytes that are supposed to follow it. 1213 /// Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is 1214 /// left as-is for anyone who may want to do such conversion, which was 1215 /// allowed in earlier algorithms. 1216 /// 1217 template<class ImplTraits, class SuperType> 1218 const ANTLR_UINT32* UTF8_IntStream<ImplTraits, SuperType>::TrailingBytesForUTF8() 1219 { 1220 static const ANTLR_UINT32 trailingBytesForUTF8[256] = { 1221 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1222 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1223 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1224 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1225 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1226 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1227 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1228 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 1229 }; 1230 1231 return trailingBytesForUTF8; 1232 } 1233 1234 /// Magic values subtracted from a buffer value during UTF8 conversion. 1235 /// This table contains as many values as there might be trailing bytes 1236 /// in a UTF-8 sequence. 1237 /// 1238 template<class ImplTraits, class SuperType> 1239 const UTF32* UTF8_IntStream<ImplTraits, SuperType>::OffsetsFromUTF8() 1240 { 1241 static const UTF32 offsetsFromUTF8[6] = 1242 { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 1243 0x03C82080UL, 0xFA082080UL, 0x82082080UL 1244 }; 1245 return offsetsFromUTF8; 1246 } 1247 1248 // End of Unicode.org tables 1249 // ------------------------- 1250 1251 1252 /** \brief Consume the next character in a UTF8 input stream 1253 * 1254 * \param input Input stream context pointer 1255 */ 1256 template<class ImplTraits, class SuperType> 1257 void UTF8_IntStream<ImplTraits, SuperType>::consume() 1258 { 1259 SuperType* input = this->get_super(); 1260 const ANTLR_UINT32* trailingBytesForUTF8 = UTF8_IntStream::TrailingBytesForUTF8(); 1261 const UTF32* offsetsFromUTF8 = UTF8_IntStream::OffsetsFromUTF8(); 1262 1263 ANTLR_UINT32 extraBytesToRead; 1264 ANTLR_UCHAR ch; 1265 ANTLR_UINT8* nextChar; 1266 1267 nextChar = input->get_nextChar(); 1268 1269 if (nextChar < (input->get_data() + input->get_sizeBuf())) 1270 { 1271 // Indicate one more character in this line 1272 // 1273 input->inc_charPositionInLine(); 1274 1275 // Are there more bytes needed to make up the whole thing? 1276 // 1277 extraBytesToRead = trailingBytesForUTF8[*nextChar]; 1278 1279 if ((nextChar + extraBytesToRead) >= (input->get_data() + input->get_sizeBuf())) 1280 { 1281 input->set_nextChar( input->get_data() + input->get_sizeBuf() ); 1282 return; 1283 } 1284 1285 // Cases deliberately fall through (see note A in antlrconvertutf.c) 1286 // Legal UTF8 is only 4 bytes but 6 bytes could be used in old UTF8 so 1287 // we allow it. 1288 // 1289 ch = 0; 1290 switch (extraBytesToRead) 1291 { 1292 case 5: ch += *nextChar++; ch <<= 6; 1293 case 4: ch += *nextChar++; ch <<= 6; 1294 case 3: ch += *nextChar++; ch <<= 6; 1295 case 2: ch += *nextChar++; ch <<= 6; 1296 case 1: ch += *nextChar++; ch <<= 6; 1297 case 0: ch += *nextChar++; 1298 } 1299 1300 // Magically correct the input value 1301 // 1302 ch -= offsetsFromUTF8[extraBytesToRead]; 1303 if (ch == input->get_newlineChar()) 1304 { 1305 /* Reset for start of a new line of input 1306 */ 1307 input->inc_line(); 1308 input->set_charPositionInLine(0); 1309 input->set_currentLine(nextChar); 1310 } 1311 1312 // Update input pointer 1313 // 1314 input->set_nextChar(nextChar); 1315 } 1316 } 1317 1318 /** \brief Return the input element assuming a UTF8 input 1319 * 1320 * \param[in] input Input stream context pointer 1321 * \param[in] la 1 based offset of next input stream element 1322 * 1323 * \return Next input character in internal ANTLR3 encoding (UTF32) 1324 */ 1325 template<class ImplTraits, class SuperType> 1326 ANTLR_UCHAR UTF8_IntStream<ImplTraits, SuperType>::_LA(ANTLR_INT32 la) 1327 { 1328 SuperType* input = this->get_super(); 1329 const ANTLR_UINT32* trailingBytesForUTF8 = UTF8_IntStream::TrailingBytesForUTF8(); 1330 const UTF32* offsetsFromUTF8 = UTF8_IntStream::OffsetsFromUTF8(); 1331 ANTLR_UINT32 extraBytesToRead; 1332 ANTLR_UCHAR ch; 1333 ANTLR_UINT8* nextChar; 1334 1335 nextChar = input->get_nextChar(); 1336 1337 // Do we need to traverse forwards or backwards? 1338 // - LA(0) is treated as LA(1) and we assume that the nextChar is 1339 // already positioned. 1340 // - LA(n+) ; n>1 means we must traverse forward n-1 characters catering for UTF8 encoding 1341 // - LA(-n) means we must traverse backwards n chracters 1342 // 1343 if (la > 1) { 1344 1345 // Make sure that we have at least one character left before trying to 1346 // loop through the buffer. 1347 // 1348 if (nextChar < (input->get_data() + input->get_sizeBuf())) 1349 { 1350 // Now traverse n-1 characters forward 1351 // 1352 while (--la > 0) 1353 { 1354 // Does the next character require trailing bytes? 1355 // If so advance the pointer by that many bytes as well as advancing 1356 // one position for what will be at least a single byte character. 1357 // 1358 nextChar += trailingBytesForUTF8[*nextChar] + 1; 1359 1360 // Does that calculation take us past the byte length of the buffer? 1361 // 1362 if (nextChar >= (input->get_data() + input->get_sizeBuf())) 1363 { 1364 return ANTLR_CHARSTREAM_EOF; 1365 } 1366 } 1367 } 1368 else 1369 { 1370 return ANTLR_CHARSTREAM_EOF; 1371 } 1372 } 1373 else 1374 { 1375 // LA is negative so we decrease the pointer by n character positions 1376 // 1377 while (nextChar > input->get_data() && la++ < 0) 1378 { 1379 // Traversing backwards in UTF8 means decermenting by one 1380 // then continuing to decrement while ever a character pattern 1381 // is flagged as being a trailing byte of an encoded code point. 1382 // Trailing UTF8 bytes always start with 10 in binary. We assumne that 1383 // the UTF8 is well formed and do not check boundary conditions 1384 // 1385 nextChar--; 1386 while ((*nextChar & 0xC0) == 0x80) 1387 { 1388 nextChar--; 1389 } 1390 } 1391 } 1392 1393 // nextChar is now pointing at the UTF8 encoded character that we need to 1394 // decode and return. 1395 // 1396 // Are there more bytes needed to make up the whole thing? 1397 // 1398 extraBytesToRead = trailingBytesForUTF8[*nextChar]; 1399 if (nextChar + extraBytesToRead >= (input->get_data() + input->get_sizeBuf())) 1400 { 1401 return ANTLR_CHARSTREAM_EOF; 1402 } 1403 1404 // Cases deliberately fall through (see note A in antlrconvertutf.c) 1405 // 1406 ch = 0; 1407 switch (extraBytesToRead) 1408 { 1409 case 5: ch += *nextChar++; ch <<= 6; 1410 case 4: ch += *nextChar++; ch <<= 6; 1411 case 3: ch += *nextChar++; ch <<= 6; 1412 case 2: ch += *nextChar++; ch <<= 6; 1413 case 1: ch += *nextChar++; ch <<= 6; 1414 case 0: ch += *nextChar++; 1415 } 1416 1417 // Magically correct the input value 1418 // 1419 ch -= offsetsFromUTF8[extraBytesToRead]; 1420 1421 return ch; 1422 } 1423 1424 template<class ImplTraits> 1425 TokenIntStream<ImplTraits>::TokenIntStream() 1426 { 1427 m_cachedSize = 0; 1428 } 1429 1430 template<class ImplTraits> 1431 ANTLR_UINT32 TokenIntStream<ImplTraits>::get_cachedSize() const 1432 { 1433 return m_cachedSize; 1434 } 1435 1436 template<class ImplTraits> 1437 void TokenIntStream<ImplTraits>::set_cachedSize( ANTLR_UINT32 cachedSize ) 1438 { 1439 m_cachedSize = cachedSize; 1440 } 1441 1442 /** Move the input pointer to the next incoming token. The stream 1443 * must become active with LT(1) available. consume() simply 1444 * moves the input pointer so that LT(1) points at the next 1445 * input symbol. Consume at least one token. 1446 * 1447 * Walk past any token not on the channel the parser is listening to. 1448 */ 1449 template<class ImplTraits> 1450 void TokenIntStream<ImplTraits>::consume() 1451 { 1452 TokenStreamType* cts = static_cast<TokenStreamType*>(this); 1453 1454 if((ANTLR_UINT32)cts->get_p() < m_cachedSize ) 1455 { 1456 cts->inc_p(); 1457 cts->set_p( cts->skipOffTokenChannels(cts->get_p()) ); 1458 } 1459 } 1460 template<class ImplTraits> 1461 void TokenIntStream<ImplTraits>::consumeInitialHiddenTokens() 1462 { 1463 ANTLR_MARKER first; 1464 ANTLR_INT32 i; 1465 TokenStreamType* ts; 1466 1467 ts = this->get_super(); 1468 first = this->index(); 1469 1470 for (i=0; i<first; i++) 1471 { 1472 ts->get_debugger()->consumeHiddenToken(ts->get(i)); 1473 } 1474 1475 ts->set_initialStreamState(false); 1476 } 1477 1478 1479 template<class ImplTraits> 1480 ANTLR_UINT32 TokenIntStream<ImplTraits>::_LA( ANTLR_INT32 i ) 1481 { 1482 const CommonTokenType* tok; 1483 TokenStreamType* ts = static_cast<TokenStreamType*>(this); 1484 1485 tok = ts->_LT(i); 1486 1487 if (tok != NULL) 1488 { 1489 return tok->get_type(); 1490 } 1491 else 1492 { 1493 return CommonTokenType::TOKEN_INVALID; 1494 } 1495 1496 } 1497 1498 template<class ImplTraits> 1499 ANTLR_MARKER TokenIntStream<ImplTraits>::mark() 1500 { 1501 BaseType::m_lastMarker = this->index(); 1502 return BaseType::m_lastMarker; 1503 } 1504 1505 template<class ImplTraits> 1506 ANTLR_UINT32 TokenIntStream<ImplTraits>::size() 1507 { 1508 if (this->get_cachedSize() > 0) 1509 { 1510 return this->get_cachedSize(); 1511 } 1512 TokenStreamType* cts = this->get_super(); 1513 1514 this->set_cachedSize( static_cast<ANTLR_UINT32>(cts->get_tokens().size()) ); 1515 return this->get_cachedSize(); 1516 } 1517 1518 template<class ImplTraits> 1519 void TokenIntStream<ImplTraits>::release() 1520 { 1521 return; 1522 } 1523 1524 template<class ImplTraits> 1525 ANTLR_MARKER TokenIntStream<ImplTraits>::tindex() 1526 { 1527 return this->get_super()->get_p(); 1528 } 1529 1530 template<class ImplTraits> 1531 void TokenIntStream<ImplTraits>::rewindLast() 1532 { 1533 this->rewind( this->get_lastMarker() ); 1534 } 1535 1536 template<class ImplTraits> 1537 void TokenIntStream<ImplTraits>::rewind(ANTLR_MARKER marker) 1538 { 1539 return this->seek(marker); 1540 } 1541 1542 template<class ImplTraits> 1543 void TokenIntStream<ImplTraits>::seek(ANTLR_MARKER index) 1544 { 1545 TokenStreamType* cts = static_cast<TokenStreamType*>(this); 1546 1547 cts->set_p( static_cast<ANTLR_INT32>(index) ); 1548 } 1549 1550 1551 /// Return a string that represents the name assoicated with the input source 1552 /// 1553 /// /param[in] is The ANTLR3_INT_STREAM interface that is representing this token stream. 1554 /// 1555 /// /returns 1556 /// /implements ANTLR3_INT_STREAM_struct::getSourceName() 1557 /// 1558 template<class ImplTraits> 1559 typename TokenIntStream<ImplTraits>::StringType 1560 TokenIntStream<ImplTraits>::getSourceName() 1561 { 1562 // Slightly convoluted as we must trace back to the lexer's input source 1563 // via the token source. The streamName that is here is not initialized 1564 // because this is a token stream, not a file or string stream, which are the 1565 // only things that have a context for a source name. 1566 // 1567 return this->get_super()->get_tokenSource()->get_fileName(); 1568 } 1569 1570 template<class ImplTraits> 1571 void TreeNodeIntStream<ImplTraits>::consume() 1572 { 1573 CommonTreeNodeStreamType* ctns = this->get_super(); 1574 if( ctns->get_p() == -1 ) 1575 ctns->fillBufferRoot(); 1576 ctns->inc_p(); 1577 } 1578 template<class ImplTraits> 1579 ANTLR_MARKER TreeNodeIntStream<ImplTraits>::tindex() 1580 { 1581 CommonTreeNodeStreamType* ctns = this->get_super(); 1582 return (ANTLR_MARKER)(ctns->get_p()); 1583 } 1584 1585 template<class ImplTraits> 1586 ANTLR_UINT32 TreeNodeIntStream<ImplTraits>::_LA(ANTLR_INT32 i) 1587 { 1588 CommonTreeNodeStreamType* tns = this->get_super(); 1589 1590 // Ask LT for the 'token' at that position 1591 // 1592 TreeType* t = tns->_LT(i); 1593 1594 if (t == NULL) 1595 { 1596 return CommonTokenType::TOKEN_INVALID; 1597 } 1598 1599 // Token node was there so return the type of it 1600 // 1601 return t->get_type(); 1602 } 1603 1604 template<class ImplTraits> 1605 ANTLR_MARKER TreeNodeIntStream<ImplTraits>::mark() 1606 { 1607 CommonTreeNodeStreamType* ctns = this->get_super(); 1608 1609 if (ctns->get_p() == -1) 1610 { 1611 ctns->fillBufferRoot(); 1612 } 1613 1614 // Return the current mark point 1615 // 1616 this->set_lastMarker( this->index() ); 1617 1618 return this->get_lastMarker(); 1619 1620 } 1621 1622 template<class ImplTraits> 1623 void TreeNodeIntStream<ImplTraits>::release(ANTLR_MARKER marker) 1624 { 1625 1626 } 1627 1628 template<class ImplTraits> 1629 void TreeNodeIntStream<ImplTraits>::rewindMark(ANTLR_MARKER marker) 1630 { 1631 this->seek(marker); 1632 } 1633 1634 template<class ImplTraits> 1635 void TreeNodeIntStream<ImplTraits>::rewindLast() 1636 { 1637 this->seek( this->get_lastMarker() ); 1638 } 1639 1640 template<class ImplTraits> 1641 void TreeNodeIntStream<ImplTraits>::seek(ANTLR_MARKER index) 1642 { 1643 CommonTreeNodeStreamType* ctns = this->get_super(); 1644 ctns->set_p( ANTLR_UINT32_CAST(index) ); 1645 } 1646 1647 template<class ImplTraits> 1648 ANTLR_UINT32 TreeNodeIntStream<ImplTraits>::size() 1649 { 1650 CommonTreeNodeStreamType* ctns = this->get_super(); 1651 1652 if (ctns->get_p() == -1) 1653 { 1654 ctns->fillBufferRoot(); 1655 } 1656 1657 return ctns->get_nodes().size(); 1658 } 1659 1660 1661 ANTLR_END_NAMESPACE() 1662