1 // 2 // ANTLRBaseRecognizer.m 3 // ANTLR 4 // 5 // Created by Alan Condit on 6/16/10. 6 // [The "BSD licence"] 7 // Copyright (c) 2010 Alan Condit 8 // All rights reserved. 9 // 10 // Redistribution and use in source and binary forms, with or without 11 // modification, are permitted provided that the following conditions 12 // are met: 13 // 1. Redistributions of source code must retain the above copyright 14 // notice, this list of conditions and the following disclaimer. 15 // 2. Redistributions in binary form must reproduce the above copyright 16 // notice, this list of conditions and the following disclaimer in the 17 // documentation and/or other materials provided with the distribution. 18 // 3. The name of the author may not be used to endorse or promote products 19 // derived from this software without specific prior written permission. 20 // 21 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 22 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 23 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 24 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 25 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 26 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 30 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 32 #import "ANTLRBaseRecognizer.h" 33 #import "ANTLRHashRule.h" 34 #import "ANTLRRuleMemo.h" 35 #import "ANTLRCommonToken.h" 36 #import "ANTLRMap.h" 37 38 extern NSInteger debug; 39 40 @implementation ANTLRBaseRecognizer 41 42 static AMutableArray *_tokenNames; 43 static NSString *_grammarFileName; 44 static NSString *NEXT_TOKEN_RULE_NAME; 45 46 @synthesize state; 47 @synthesize grammarFileName; 48 //@synthesize failed; 49 @synthesize sourceName; 50 //@synthesize numberOfSyntaxErrors; 51 @synthesize tokenNames; 52 53 + (void) initialize 54 { 55 NEXT_TOKEN_RULE_NAME = [NSString stringWithString:@"nextToken"]; 56 [NEXT_TOKEN_RULE_NAME retain]; 57 } 58 59 + (ANTLRBaseRecognizer *) newANTLRBaseRecognizer 60 { 61 return [[ANTLRBaseRecognizer alloc] init]; 62 } 63 64 + (ANTLRBaseRecognizer *) newANTLRBaseRecognizerWithRuleLen:(NSInteger)aLen 65 { 66 return [[ANTLRBaseRecognizer alloc] initWithLen:aLen]; 67 } 68 69 + (ANTLRBaseRecognizer *) newANTLRBaseRecognizer:(ANTLRRecognizerSharedState *)aState 70 { 71 return [[ANTLRBaseRecognizer alloc] initWithState:aState]; 72 } 73 74 + (AMutableArray *)getTokenNames 75 { 76 return _tokenNames; 77 } 78 79 + (void)setTokenNames:(AMutableArray *)theTokNams 80 { 81 if ( _tokenNames != theTokNams ) { 82 if ( _tokenNames ) [_tokenNames release]; 83 [theTokNams retain]; 84 } 85 _tokenNames = theTokNams; 86 } 87 88 + (void)setGrammarFileName:(NSString *)aFileName 89 { 90 if ( _grammarFileName != aFileName ) { 91 if ( _grammarFileName ) [_grammarFileName release]; 92 [aFileName retain]; 93 } 94 [_grammarFileName retain]; 95 } 96 97 - (id) init 98 { 99 if ((self = [super init]) != nil) { 100 if (state == nil) { 101 state = [[ANTLRRecognizerSharedState newANTLRRecognizerSharedState] retain]; 102 } 103 tokenNames = _tokenNames; 104 if ( tokenNames ) [tokenNames retain]; 105 grammarFileName = _grammarFileName; 106 if ( grammarFileName ) [grammarFileName retain]; 107 state._fsp = -1; 108 state.errorRecovery = NO; // are we recovering? 109 state.lastErrorIndex = -1; 110 state.failed = NO; // indicate that some match failed 111 state.syntaxErrors = 0; 112 state.backtracking = 0; // the level of backtracking 113 state.tokenStartCharIndex = -1; 114 } 115 return self; 116 } 117 118 - (id) initWithLen:(NSInteger)aLen 119 { 120 if ((self = [super init]) != nil) { 121 if (state == nil) { 122 state = [[ANTLRRecognizerSharedState newANTLRRecognizerSharedStateWithRuleLen:aLen] retain]; 123 } 124 tokenNames = _tokenNames; 125 if ( tokenNames ) [tokenNames retain]; 126 grammarFileName = _grammarFileName; 127 if ( grammarFileName ) [grammarFileName retain]; 128 state._fsp = -1; 129 state.errorRecovery = NO; // are we recovering? 130 state.lastErrorIndex = -1; 131 state.failed = NO; // indicate that some match failed 132 state.syntaxErrors = 0; 133 state.backtracking = 0; // the level of backtracking 134 state.tokenStartCharIndex = -1; 135 } 136 return self; 137 } 138 139 - (id) initWithState:(ANTLRRecognizerSharedState *)aState 140 { 141 if ((self = [super init]) != nil) { 142 state = aState; 143 if (state == nil) { 144 state = [ANTLRRecognizerSharedState newANTLRRecognizerSharedState]; 145 } 146 [state retain]; 147 tokenNames = _tokenNames; 148 if ( tokenNames ) [tokenNames retain]; 149 grammarFileName = _grammarFileName; 150 if ( grammarFileName ) [grammarFileName retain]; 151 state._fsp = -1; 152 state.errorRecovery = NO; // are we recovering? 153 state.lastErrorIndex = -1; 154 state.failed = NO; // indicate that some match failed 155 state.syntaxErrors = 0; 156 state.backtracking = 0; // the level of backtracking 157 state.tokenStartCharIndex = -1; 158 } 159 return self; 160 } 161 162 - (void)dealloc 163 { 164 #ifdef DEBUG_DEALLOC 165 NSLog( @"called dealloc in ANTLRBaseRecognizer" ); 166 #endif 167 if ( grammarFileName ) [grammarFileName release]; 168 if ( tokenNames ) [tokenNames release]; 169 if ( state ) [state release]; 170 [super dealloc]; 171 } 172 173 // reset the recognizer to the initial state. does not touch the token source! 174 // this can be extended by the grammar writer to reset custom ivars 175 - (void) reset 176 { 177 if ( state == nil ) 178 return; 179 if ( state.following != nil ) { 180 if ( [state.following count] ) 181 [state.following removeAllObjects]; 182 } 183 state._fsp = -1; 184 state.errorRecovery = NO; // are we recovering? 185 state.lastErrorIndex = -1; 186 state.failed = NO; // indicate that some match failed 187 state.syntaxErrors = 0; 188 state.backtracking = 0; // the level of backtracking 189 state.tokenStartCharIndex = -1; 190 if ( state.ruleMemo != nil ) { 191 if ( [state.ruleMemo count] ) 192 [state.ruleMemo removeAllObjects]; 193 } 194 } 195 196 - (BOOL) getFailed 197 { 198 return [state getFailed]; 199 } 200 201 - (void) setFailed:(BOOL)flag 202 { 203 [state setFailed:flag]; 204 } 205 206 - (ANTLRRecognizerSharedState *) getState 207 { 208 return state; 209 } 210 211 - (void) setState:(ANTLRRecognizerSharedState *) theState 212 { 213 if (state != theState) { 214 if ( state ) [state release]; 215 state = theState; 216 [state retain]; 217 } 218 } 219 220 - (id)input 221 { 222 return nil; // Must be overriden in inheriting class 223 } 224 225 - (void)skip // override in inheriting class 226 { 227 return; 228 } 229 230 -(id) match:(id<ANTLRIntStream>)anInput TokenType:(NSInteger)ttype Follow:(ANTLRBitSet *)follow 231 { 232 id matchedSymbol = [self getCurrentInputSymbol:anInput]; 233 if ([anInput LA:1] == ttype) { 234 [anInput consume]; 235 state.errorRecovery = NO; 236 state.failed = NO; 237 return matchedSymbol; 238 } 239 if (state.backtracking > 0) { 240 state.failed = YES; 241 return matchedSymbol; 242 } 243 matchedSymbol = [self recoverFromMismatchedToken:anInput TokenType:ttype Follow:follow]; 244 return matchedSymbol; 245 } 246 247 -(void) matchAny:(id<ANTLRIntStream>)anInput 248 { 249 state.errorRecovery = NO; 250 state.failed = NO; 251 [anInput consume]; 252 } 253 254 -(BOOL) mismatchIsUnwantedToken:(id<ANTLRIntStream>)anInput TokenType:(NSInteger)ttype 255 { 256 return [anInput LA:2] == ttype; 257 } 258 259 -(BOOL) mismatchIsMissingToken:(id<ANTLRIntStream>)anInput Follow:(ANTLRBitSet *) follow 260 { 261 if ( follow == nil ) { 262 // we have no information about the follow; we can only consume 263 // a single token and hope for the best 264 return NO; 265 } 266 // compute what can follow this grammar element reference 267 if ( [follow member:ANTLRTokenTypeEOR] ) { 268 ANTLRBitSet *viableTokensFollowingThisRule = [self computeContextSensitiveRuleFOLLOW]; 269 follow = [follow or:viableTokensFollowingThisRule]; 270 if ( state._fsp >= 0 ) { // remove EOR if we're not the start symbol 271 [follow remove:(ANTLRTokenTypeEOR)]; 272 } 273 } 274 // if current token is consistent with what could come after set 275 // then we know we're missing a token; error recovery is free to 276 // "insert" the missing token 277 278 //System.out.println("viable tokens="+follow.toString(getTokenNames())); 279 //System.out.println("LT(1)="+((TokenStream)input).LT(1)); 280 281 // BitSet cannot handle negative numbers like -1 (EOF) so I leave EOR 282 // in follow set to indicate that the fall of the start symbol is 283 // in the set (EOF can follow). 284 if ( [follow member:[anInput LA:1]] || [follow member:ANTLRTokenTypeEOR] ) { 285 //System.out.println("LT(1)=="+((TokenStream)input).LT(1)+" is consistent with what follows; inserting..."); 286 return YES; 287 } 288 return NO; 289 } 290 291 /** Report a recognition problem. 292 * 293 * This method sets errorRecovery to indicate the parser is recovering 294 * not parsing. Once in recovery mode, no errors are generated. 295 * To get out of recovery mode, the parser must successfully match 296 * a token (after a resync). So it will go: 297 * 298 * 1. error occurs 299 * 2. enter recovery mode, report error 300 * 3. consume until token found in resynch set 301 * 4. try to resume parsing 302 * 5. next match() will reset errorRecovery mode 303 * 304 * If you override, make sure to update syntaxErrors if you care about that. 305 */ 306 -(void) reportError:(ANTLRRecognitionException *) e 307 { 308 // if we've already reported an error and have not matched a token 309 // yet successfully, don't report any errors. 310 if ( state.errorRecovery ) { 311 //System.err.print("[SPURIOUS] "); 312 return; 313 } 314 state.syntaxErrors++; // don't count spurious 315 state.errorRecovery = YES; 316 317 [self displayRecognitionError:[self getTokenNames] Exception:e]; 318 } 319 320 -(void) displayRecognitionError:(AMutableArray *)theTokNams Exception:(ANTLRRecognitionException *)e 321 { 322 NSString *hdr = [self getErrorHeader:e]; 323 NSString *msg = [self getErrorMessage:e TokenNames:theTokNams]; 324 [self emitErrorMessage:[NSString stringWithFormat:@" %@ %@", hdr, msg]]; 325 } 326 327 /** What error message should be generated for the various 328 * exception types? 329 * 330 * Not very object-oriented code, but I like having all error message 331 * generation within one method rather than spread among all of the 332 * exception classes. This also makes it much easier for the exception 333 * handling because the exception classes do not have to have pointers back 334 * to this object to access utility routines and so on. Also, changing 335 * the message for an exception type would be difficult because you 336 * would have to subclassing exception, but then somehow get ANTLR 337 * to make those kinds of exception objects instead of the default. 338 * This looks weird, but trust me--it makes the most sense in terms 339 * of flexibility. 340 * 341 * For grammar debugging, you will want to override this to add 342 * more information such as the stack frame with 343 * getRuleInvocationStack(e, this.getClass().getName()) and, 344 * for no viable alts, the decision description and state etc... 345 * 346 * Override this to change the message generated for one or more 347 * exception types. 348 */ 349 - (NSString *)getErrorMessage:(ANTLRRecognitionException *)e TokenNames:(AMutableArray *)theTokNams 350 { 351 // NSString *msg = [e getMessage]; 352 NSString *msg; 353 if ( [e isKindOfClass:[ANTLRUnwantedTokenException class]] ) { 354 ANTLRUnwantedTokenException *ute = (ANTLRUnwantedTokenException *)e; 355 NSString *tokenName=@"<unknown>"; 356 if ( ute.expecting == ANTLRTokenTypeEOF ) { 357 tokenName = @"EOF"; 358 } 359 else { 360 tokenName = (NSString *)[theTokNams objectAtIndex:ute.expecting]; 361 } 362 msg = [NSString stringWithFormat:@"extraneous input %@ expecting %@", [self getTokenErrorDisplay:[ute getUnexpectedToken]], 363 tokenName]; 364 } 365 else if ( [e isKindOfClass:[ANTLRMissingTokenException class] ] ) { 366 ANTLRMissingTokenException *mte = (ANTLRMissingTokenException *)e; 367 NSString *tokenName=@"<unknown>"; 368 if ( mte.expecting== ANTLRTokenTypeEOF ) { 369 tokenName = @"EOF"; 370 } 371 else { 372 tokenName = [theTokNams objectAtIndex:mte.expecting]; 373 } 374 msg = [NSString stringWithFormat:@"missing %@ at %@", tokenName, [self getTokenErrorDisplay:(e.token)] ]; 375 } 376 else if ( [e isKindOfClass:[ANTLRMismatchedTokenException class]] ) { 377 ANTLRMismatchedTokenException *mte = (ANTLRMismatchedTokenException *)e; 378 NSString *tokenName=@"<unknown>"; 379 if ( mte.expecting== ANTLRTokenTypeEOF ) { 380 tokenName = @"EOF"; 381 } 382 else { 383 tokenName = [theTokNams objectAtIndex:mte.expecting]; 384 } 385 msg = [NSString stringWithFormat:@"mismatched input %@ expecting %@",[self getTokenErrorDisplay:(e.token)], tokenName]; 386 } 387 else if ( [e isKindOfClass:[ANTLRMismatchedTreeNodeException class]] ) { 388 ANTLRMismatchedTreeNodeException *mtne = (ANTLRMismatchedTreeNodeException *)e; 389 NSString *tokenName=@"<unknown>"; 390 if ( mtne.expecting==ANTLRTokenTypeEOF ) { 391 tokenName = @"EOF"; 392 } 393 else { 394 tokenName = [theTokNams objectAtIndex:mtne.expecting]; 395 } 396 msg = [NSString stringWithFormat:@"mismatched tree node: %@ expecting %@", mtne.node, tokenName]; 397 } 398 else if ( [e isKindOfClass:[ANTLRNoViableAltException class]] ) { 399 //NoViableAltException *nvae = (NoViableAltException *)e; 400 // for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>" 401 // and "(decision="+nvae.decisionNumber+") and 402 // "state "+nvae.stateNumber 403 msg = [NSString stringWithFormat:@"no viable alternative at input %@", [self getTokenErrorDisplay:e.token]]; 404 } 405 else if ( [e isKindOfClass:[ANTLREarlyExitException class]] ) { 406 //ANTLREarlyExitException *eee = (ANTLREarlyExitException *)e; 407 // for development, can add "(decision="+eee.decisionNumber+")" 408 msg =[NSString stringWithFormat: @"required (...)+ loop did not match anything at input ", [self getTokenErrorDisplay:e.token]]; 409 } 410 else if ( [e isKindOfClass:[ANTLRMismatchedSetException class]] ) { 411 ANTLRMismatchedSetException *mse = (ANTLRMismatchedSetException *)e; 412 msg = [NSString stringWithFormat:@"mismatched input %@ expecting set %@", 413 [self getTokenErrorDisplay:(e.token)], 414 mse.expecting]; 415 } 416 #pragma warning NotSet not yet implemented. 417 else if ( [e isKindOfClass:[ANTLRMismatchedNotSetException class] ] ) { 418 ANTLRMismatchedNotSetException *mse = (ANTLRMismatchedNotSetException *)e; 419 msg = [NSString stringWithFormat:@"mismatched input %@ expecting set %@", 420 [self getTokenErrorDisplay:(e.token)], 421 mse.expecting]; 422 } 423 else if ( [e isKindOfClass:[ANTLRFailedPredicateException class]] ) { 424 ANTLRFailedPredicateException *fpe = (ANTLRFailedPredicateException *)e; 425 msg = [NSString stringWithFormat:@"rule %@ failed predicate: { %@ }?", fpe.ruleName, fpe.predicate]; 426 } 427 else { 428 msg = [NSString stringWithFormat:@"Exception= %@\n", e.name]; 429 } 430 return msg; 431 } 432 433 /** Get number of recognition errors (lexer, parser, tree parser). Each 434 * recognizer tracks its own number. So parser and lexer each have 435 * separate count. Does not count the spurious errors found between 436 * an error and next valid token match 437 * 438 * See also reportError() 439 */ 440 - (NSInteger) getNumberOfSyntaxErrors 441 { 442 return state.syntaxErrors; 443 } 444 445 /** What is the error header, normally line/character position information? */ 446 - (NSString *)getErrorHeader:(ANTLRRecognitionException *)e 447 { 448 return [NSString stringWithFormat:@"line %d:%d", e.line, e.charPositionInLine]; 449 } 450 451 /** How should a token be displayed in an error message? The default 452 * is to display just the text, but during development you might 453 * want to have a lot of information spit out. Override in that case 454 * to use t.toString() (which, for CommonToken, dumps everything about 455 * the token). This is better than forcing you to override a method in 456 * your token objects because you don't have to go modify your lexer 457 * so that it creates a new Java type. 458 */ 459 - (NSString *)getTokenErrorDisplay:(id<ANTLRToken>)t 460 { 461 NSString *s = t.text; 462 if ( s == nil ) { 463 if ( t.type == ANTLRTokenTypeEOF ) { 464 s = @"<EOF>"; 465 } 466 else { 467 s = [NSString stringWithFormat:@"<%@>", t.type]; 468 } 469 } 470 s = [s stringByReplacingOccurrencesOfString:@"\n" withString:@"\\\\n"]; 471 s = [s stringByReplacingOccurrencesOfString:@"\r" withString:@"\\\\r"]; 472 s = [s stringByReplacingOccurrencesOfString:@"\t" withString:@"\\\\t"]; 473 return [NSString stringWithFormat:@"\'%@\'", s]; 474 } 475 476 /** Override this method to change where error messages go */ 477 - (void) emitErrorMessage:(NSString *) msg 478 { 479 // System.err.println(msg); 480 NSLog(@"%@", msg); 481 } 482 483 /** Recover from an error found on the input stream. This is 484 * for NoViableAlt and mismatched symbol exceptions. If you enable 485 * single token insertion and deletion, this will usually not 486 * handle mismatched symbol exceptions but there could be a mismatched 487 * token that the match() routine could not recover from. 488 */ 489 - (void)recover:(id<ANTLRIntStream>)anInput Exception:(ANTLRRecognitionException *)re 490 { 491 if ( state.lastErrorIndex == anInput.index ) { 492 // uh oh, another error at same token index; must be a case 493 // where LT(1) is in the recovery token set so nothing is 494 // consumed; consume a single token so at least to prevent 495 // an infinite loop; this is a failsafe. 496 [anInput consume]; 497 } 498 state.lastErrorIndex = anInput.index; 499 ANTLRBitSet *followSet = [self computeErrorRecoverySet]; 500 [self beginResync]; 501 [self consumeUntilFollow:anInput Follow:followSet]; 502 [self endResync]; 503 } 504 505 - (void) beginResync 506 { 507 508 } 509 510 - (void) endResync 511 { 512 513 } 514 515 /* Compute the error recovery set for the current rule. During 516 * rule invocation, the parser pushes the set of tokens that can 517 * follow that rule reference on the stack; this amounts to 518 * computing FIRST of what follows the rule reference in the 519 * enclosing rule. This local follow set only includes tokens 520 * from within the rule; i.e., the FIRST computation done by 521 * ANTLR stops at the end of a rule. 522 * 523 * EXAMPLE 524 * 525 * When you find a "no viable alt exception", the input is not 526 * consistent with any of the alternatives for rule r. The best 527 * thing to do is to consume tokens until you see something that 528 * can legally follow a call to r *or* any rule that called r. 529 * You don't want the exact set of viable next tokens because the 530 * input might just be missing a token--you might consume the 531 * rest of the input looking for one of the missing tokens. 532 * 533 * Consider grammar: 534 * 535 * a : '[' b ']' 536 * | '(' b ')' 537 * ; 538 * b : c '^' INT ; 539 * c : ID 540 * | INT 541 * ; 542 * 543 * At each rule invocation, the set of tokens that could follow 544 * that rule is pushed on a stack. Here are the various "local" 545 * follow sets: 546 * 547 * FOLLOW(b1_in_a) = FIRST(']') = ']' 548 * FOLLOW(b2_in_a) = FIRST(')') = ')' 549 * FOLLOW(c_in_b) = FIRST('^') = '^' 550 * 551 * Upon erroneous input "[]", the call chain is 552 * 553 * a -> b -> c 554 * 555 * and, hence, the follow context stack is: 556 * 557 * depth local follow set after call to rule 558 * 0 <EOF> a (from main()) 559 * 1 ']' b 560 * 3 '^' c 561 * 562 * Notice that ')' is not included, because b would have to have 563 * been called from a different context in rule a for ')' to be 564 * included. 565 * 566 * For error recovery, we cannot consider FOLLOW(c) 567 * (context-sensitive or otherwise). We need the combined set of 568 * all context-sensitive FOLLOW sets--the set of all tokens that 569 * could follow any reference in the call chain. We need to 570 * resync to one of those tokens. Note that FOLLOW(c)='^' and if 571 * we resync'd to that token, we'd consume until EOF. We need to 572 * sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}. 573 * In this case, for input "[]", LA(1) is in this set so we would 574 * not consume anything and after printing an error rule c would 575 * return normally. It would not find the required '^' though. 576 * At this point, it gets a mismatched token error and throws an 577 * exception (since LA(1) is not in the viable following token 578 * set). The rule exception handler tries to recover, but finds 579 * the same recovery set and doesn't consume anything. Rule b 580 * exits normally returning to rule a. Now it finds the ']' (and 581 * with the successful match exits errorRecovery mode). 582 * 583 * So, you cna see that the parser walks up call chain looking 584 * for the token that was a member of the recovery set. 585 * 586 * Errors are not generated in errorRecovery mode. 587 * 588 * ANTLR's error recovery mechanism is based upon original ideas: 589 * 590 * "Algorithms + Data Structures = Programs" by Niklaus Wirth 591 * 592 * and 593 * 594 * "A note on error recovery in recursive descent parsers": 595 * http://portal.acm.org/citation.cfm?id=947902.947905 596 * 597 * Later, Josef Grosch had some good ideas: 598 * 599 * "Efficient and Comfortable Error Recovery in Recursive Descent 600 * Parsers": 601 * ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip 602 * 603 * Like Grosch I implemented local FOLLOW sets that are combined 604 * at run-time upon error to avoid overhead during parsing. 605 */ 606 - (ANTLRBitSet *) computeErrorRecoverySet 607 { 608 return [self combineFollows:NO]; 609 } 610 611 /** Compute the context-sensitive FOLLOW set for current rule. 612 * This is set of token types that can follow a specific rule 613 * reference given a specific call chain. You get the set of 614 * viable tokens that can possibly come next (lookahead depth 1) 615 * given the current call chain. Contrast this with the 616 * definition of plain FOLLOW for rule r: 617 * 618 * FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)} 619 * 620 * where x in T* and alpha, beta in V*; T is set of terminals and 621 * V is the set of terminals and nonterminals. In other words, 622 * FOLLOW(r) is the set of all tokens that can possibly follow 623 * references to r in *any* sentential form (context). At 624 * runtime, however, we know precisely which context applies as 625 * we have the call chain. We may compute the exact (rather 626 * than covering superset) set of following tokens. 627 * 628 * For example, consider grammar: 629 * 630 * stat : ID '=' expr ';' // FOLLOW(stat)=={EOF} 631 * | "return" expr '.' 632 * ; 633 * expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'} 634 * atom : INT // FOLLOW(atom)=={'+',')',';','.'} 635 * | '(' expr ')' 636 * ; 637 * 638 * The FOLLOW sets are all inclusive whereas context-sensitive 639 * FOLLOW sets are precisely what could follow a rule reference. 640 * For input input "i=(3);", here is the derivation: 641 * 642 * stat => ID '=' expr ';' 643 * => ID '=' atom ('+' atom)* ';' 644 * => ID '=' '(' expr ')' ('+' atom)* ';' 645 * => ID '=' '(' atom ')' ('+' atom)* ';' 646 * => ID '=' '(' INT ')' ('+' atom)* ';' 647 * => ID '=' '(' INT ')' ';' 648 * 649 * At the "3" token, you'd have a call chain of 650 * 651 * stat -> expr -> atom -> expr -> atom 652 * 653 * What can follow that specific nested ref to atom? Exactly ')' 654 * as you can see by looking at the derivation of this specific 655 * input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}. 656 * 657 * You want the exact viable token set when recovering from a 658 * token mismatch. Upon token mismatch, if LA(1) is member of 659 * the viable next token set, then you know there is most likely 660 * a missing token in the input stream. "Insert" one by just not 661 * throwing an exception. 662 */ 663 - (ANTLRBitSet *)computeContextSensitiveRuleFOLLOW 664 { 665 return [self combineFollows:YES]; 666 } 667 668 // what is exact? it seems to only add sets from above on stack 669 // if EOR is in set i. When it sees a set w/o EOR, it stops adding. 670 // Why would we ever want them all? Maybe no viable alt instead of 671 // mismatched token? 672 - (ANTLRBitSet *)combineFollows:(BOOL) exact 673 { 674 NSInteger top = state._fsp; 675 ANTLRBitSet *followSet = [[ANTLRBitSet newANTLRBitSet] retain]; 676 for (int i = top; i >= 0; i--) { 677 ANTLRBitSet *localFollowSet = (ANTLRBitSet *)[state.following objectAtIndex:i]; 678 /* 679 System.out.println("local follow depth "+i+"="+ 680 localFollowSet.toString(getTokenNames())+")"); 681 */ 682 [followSet orInPlace:localFollowSet]; 683 if ( exact ) { 684 // can we see end of rule? 685 if ( [localFollowSet member:ANTLRTokenTypeEOR] ) { 686 // Only leave EOR in set if at top (start rule); this lets 687 // us know if have to include follow(start rule); i.e., EOF 688 if ( i > 0 ) { 689 [followSet remove:ANTLRTokenTypeEOR]; 690 } 691 } 692 else { // can't see end of rule, quit 693 break; 694 } 695 } 696 } 697 return followSet; 698 } 699 700 /** Attempt to recover from a single missing or extra token. 701 * 702 * EXTRA TOKEN 703 * 704 * LA(1) is not what we are looking for. If LA(2) has the right token, 705 * however, then assume LA(1) is some extra spurious token. Delete it 706 * and LA(2) as if we were doing a normal match(), which advances the 707 * input. 708 * 709 * MISSING TOKEN 710 * 711 * If current token is consistent with what could come after 712 * ttype then it is ok to "insert" the missing token, else throw 713 * exception For example, Input "i=(3;" is clearly missing the 714 * ')'. When the parser returns from the nested call to expr, it 715 * will have call chain: 716 * 717 * stat -> expr -> atom 718 * 719 * and it will be trying to match the ')' at this point in the 720 * derivation: 721 * 722 * => ID '=' '(' INT ')' ('+' atom)* ';' 723 * ^ 724 * match() will see that ';' doesn't match ')' and report a 725 * mismatched token error. To recover, it sees that LA(1)==';' 726 * is in the set of tokens that can follow the ')' token 727 * reference in rule atom. It can assume that you forgot the ')'. 728 */ 729 - (id<ANTLRToken>)recoverFromMismatchedToken:(id<ANTLRIntStream>)anInput 730 TokenType:(NSInteger)ttype 731 Follow:(ANTLRBitSet *)follow 732 { 733 ANTLRRecognitionException *e = nil; 734 // if next token is what we are looking for then "delete" this token 735 if ( [self mismatchIsUnwantedToken:anInput TokenType:ttype] ) { 736 e = [ANTLRUnwantedTokenException newException:ttype Stream:anInput]; 737 /* 738 System.err.println("recoverFromMismatchedToken deleting "+ 739 ((TokenStream)input).LT(1)+ 740 " since "+((TokenStream)input).LT(2)+" is what we want"); 741 */ 742 [self beginResync]; 743 [anInput consume]; // simply delete extra token 744 [self endResync]; 745 [self reportError:e]; // report after consuming so AW sees the token in the exception 746 // we want to return the token we're actually matching 747 id matchedSymbol = [self getCurrentInputSymbol:anInput]; 748 [anInput consume]; // move past ttype token as if all were ok 749 return matchedSymbol; 750 } 751 // can't recover with single token deletion, try insertion 752 if ( [self mismatchIsMissingToken:anInput Follow:follow] ) { 753 id<ANTLRToken> inserted = [self getMissingSymbol:anInput Exception:e TokenType:ttype Follow:follow]; 754 e = [ANTLRMissingTokenException newException:ttype Stream:anInput With:inserted]; 755 [self reportError:e]; // report after inserting so AW sees the token in the exception 756 return inserted; 757 } 758 // even that didn't work; must throw the exception 759 e = [ANTLRMismatchedTokenException newException:ttype Stream:anInput]; 760 @throw e; 761 } 762 763 /** Not currently used */ 764 -(id) recoverFromMismatchedSet:(id<ANTLRIntStream>)anInput 765 Exception:(ANTLRRecognitionException *)e 766 Follow:(ANTLRBitSet *) follow 767 { 768 if ( [self mismatchIsMissingToken:anInput Follow:follow] ) { 769 // System.out.println("missing token"); 770 [self reportError:e]; 771 // we don't know how to conjure up a token for sets yet 772 return [self getMissingSymbol:anInput Exception:e TokenType:ANTLRTokenTypeInvalid Follow:follow]; 773 } 774 // TODO do single token deletion like above for Token mismatch 775 @throw e; 776 } 777 778 /** Match needs to return the current input symbol, which gets put 779 * into the label for the associated token ref; e.g., x=ID. Token 780 * and tree parsers need to return different objects. Rather than test 781 * for input stream type or change the IntStream interface, I use 782 * a simple method to ask the recognizer to tell me what the current 783 * input symbol is. 784 * 785 * This is ignored for lexers. 786 */ 787 - (id) getCurrentInputSymbol:(id<ANTLRIntStream>)anInput 788 { 789 return nil; 790 } 791 792 /** Conjure up a missing token during error recovery. 793 * 794 * The recognizer attempts to recover from single missing 795 * symbols. But, actions might refer to that missing symbol. 796 * For example, x=ID {f($x);}. The action clearly assumes 797 * that there has been an identifier matched previously and that 798 * $x points at that token. If that token is missing, but 799 * the next token in the stream is what we want we assume that 800 * this token is missing and we keep going. Because we 801 * have to return some token to replace the missing token, 802 * we have to conjure one up. This method gives the user control 803 * over the tokens returned for missing tokens. Mostly, 804 * you will want to create something special for identifier 805 * tokens. For literals such as '{' and ',', the default 806 * action in the parser or tree parser works. It simply creates 807 * a CommonToken of the appropriate type. The text will be the token. 808 * If you change what tokens must be created by the lexer, 809 * override this method to create the appropriate tokens. 810 */ 811 - (id)getMissingSymbol:(id<ANTLRIntStream>)anInput 812 Exception:(ANTLRRecognitionException *)e 813 TokenType:(NSInteger)expectedTokenType 814 Follow:(ANTLRBitSet *)follow 815 { 816 return nil; 817 } 818 819 820 -(void) consumeUntilTType:(id<ANTLRIntStream>)anInput TokenType:(NSInteger)tokenType 821 { 822 //System.out.println("consumeUntil "+tokenType); 823 int ttype = [anInput LA:1]; 824 while (ttype != ANTLRTokenTypeEOF && ttype != tokenType) { 825 [anInput consume]; 826 ttype = [anInput LA:1]; 827 } 828 } 829 830 /** Consume tokens until one matches the given token set */ 831 -(void) consumeUntilFollow:(id<ANTLRIntStream>)anInput Follow:(ANTLRBitSet *)set 832 { 833 //System.out.println("consumeUntil("+set.toString(getTokenNames())+")"); 834 int ttype = [anInput LA:1]; 835 while (ttype != ANTLRTokenTypeEOF && ![set member:ttype] ) { 836 //System.out.println("consume during recover LA(1)="+getTokenNames()[input.LA(1)]); 837 [anInput consume]; 838 ttype = [anInput LA:1]; 839 } 840 } 841 842 /** Push a rule's follow set using our own hardcoded stack */ 843 - (void)pushFollow:(ANTLRBitSet *)fset 844 { 845 if ( (state._fsp +1) >= [state.following count] ) { 846 // AMutableArray *f = [AMutableArray arrayWithCapacity:[[state.following] count]*2]; 847 // System.arraycopy(state.following, 0, f, 0, state.following.length); 848 // state.following = f; 849 [state.following addObject:fset]; 850 [fset retain]; 851 state._fsp++; 852 } 853 else { 854 [state.following replaceObjectAtIndex:++state._fsp withObject:fset]; 855 } 856 } 857 858 - (ANTLRBitSet *)popFollow 859 { 860 ANTLRBitSet *fset; 861 862 if ( state._fsp >= 0 && [state.following count] > 0 ) { 863 fset = [state.following objectAtIndex:state._fsp--]; 864 [state.following removeLastObject]; 865 return fset; 866 } 867 else { 868 NSLog( @"Attempted to pop a follow when none exists on the stack\n" ); 869 } 870 return nil; 871 } 872 873 /** Return List<String> of the rules in your parser instance 874 * leading up to a call to this method. You could override if 875 * you want more details such as the file/line info of where 876 * in the parser java code a rule is invoked. 877 * 878 * This is very useful for error messages and for context-sensitive 879 * error recovery. 880 */ 881 - (AMutableArray *)getRuleInvocationStack 882 { 883 NSString *parserClassName = [[self className] retain]; 884 return [self getRuleInvocationStack:[ANTLRRecognitionException newException] Recognizer:parserClassName]; 885 } 886 887 /** A more general version of getRuleInvocationStack where you can 888 * pass in, for example, a RecognitionException to get it's rule 889 * stack trace. This routine is shared with all recognizers, hence, 890 * static. 891 * 892 * TODO: move to a utility class or something; weird having lexer call this 893 */ 894 - (AMutableArray *)getRuleInvocationStack:(ANTLRRecognitionException *)e 895 Recognizer:(NSString *)recognizerClassName 896 { 897 // char *name; 898 AMutableArray *rules = [[AMutableArray arrayWithCapacity:20] retain]; 899 NSArray *stack = [e callStackSymbols]; 900 int i = 0; 901 for (i = [stack count]-1; i >= 0; i--) { 902 NSString *t = [stack objectAtIndex:i]; 903 // NSLog(@"stack %d = %@\n", i, t); 904 if ( [t commonPrefixWithString:@"org.antlr.runtime." options:NSLiteralSearch] ) { 905 // id aClass = objc_getClass( [t UTF8String] ); 906 continue; // skip support code such as this method 907 } 908 if ( [t isEqualTo:NEXT_TOKEN_RULE_NAME] ) { 909 // name = sel_getName(method_getName(method)); 910 // NSString *aMethod = [NSString stringWithFormat:@"%s", name]; 911 continue; 912 } 913 if ( ![t isEqualTo:recognizerClassName] ) { 914 // name = class_getName( [t UTF8String] ); 915 continue; // must not be part of this parser 916 } 917 [rules addObject:t]; 918 } 919 #ifdef DONTUSEYET 920 StackTraceElement[] stack = e.getStackTrace(); 921 int i = 0; 922 for (i=stack.length-1; i>=0; i--) { 923 StackTraceElement t = stack[i]; 924 if ( [t getClassName().startsWith("org.antlr.runtime.") ) { 925 continue; // skip support code such as this method 926 } 927 if ( [[t getMethodName] equals:NEXT_TOKEN_RULE_NAME] ) { 928 continue; 929 } 930 if ( ![[t getClassName] equals:recognizerClassName] ) { 931 continue; // must not be part of this parser 932 } 933 [rules addObject:[t getMethodName]]; 934 } 935 #endif 936 [stack release]; 937 return rules; 938 } 939 940 - (NSInteger) getBacktrackingLevel 941 { 942 return [state getBacktracking]; 943 } 944 945 - (void) setBacktrackingLevel:(NSInteger)level 946 { 947 [state setBacktracking:level]; 948 } 949 950 /** Used to print out token names like ID during debugging and 951 * error reporting. The generated parsers implement a method 952 * that overrides this to point to their String[] tokenNames. 953 */ 954 - (NSArray *)getTokenNames 955 { 956 return tokenNames; 957 } 958 959 /** For debugging and other purposes, might want the grammar name. 960 * Have ANTLR generate an implementation for this method. 961 */ 962 - (NSString *)getGrammarFileName 963 { 964 return grammarFileName; 965 } 966 967 - (NSString *)getSourceName 968 { 969 return nil; 970 } 971 972 /** A convenience method for use most often with template rewrites. 973 * Convert a List<Token> to List<String> 974 */ 975 - (AMutableArray *)toStrings:(AMutableArray *)tokens 976 { 977 if ( tokens == nil ) 978 return nil; 979 AMutableArray *strings = [AMutableArray arrayWithCapacity:[tokens count]]; 980 id object; 981 NSInteger i = 0; 982 for (object in tokens) { 983 [strings addObject:[object text]]; 984 i++; 985 } 986 return strings; 987 } 988 989 /** Given a rule number and a start token index number, return 990 * ANTLR_MEMO_RULE_UNKNOWN if the rule has not parsed input starting from 991 * start index. If this rule has parsed input starting from the 992 * start index before, then return where the rule stopped parsing. 993 * It returns the index of the last token matched by the rule. 994 * 995 * For now we use a hashtable and just the slow Object-based one. 996 * Later, we can make a special one for ints and also one that 997 * tosses out data after we commit past input position i. 998 */ 999 - (NSInteger)getRuleMemoization:(NSInteger)ruleIndex StartIndex:(NSInteger)ruleStartIndex 1000 { 1001 NSNumber *stopIndexI; 1002 ANTLRHashRule *aHashRule; 1003 if ( (aHashRule = [state.ruleMemo objectAtIndex:ruleIndex]) == nil ) { 1004 aHashRule = [ANTLRHashRule newANTLRHashRuleWithLen:17]; 1005 [state.ruleMemo insertObject:aHashRule atIndex:ruleIndex]; 1006 } 1007 stopIndexI = [aHashRule getRuleMemoStopIndex:ruleStartIndex]; 1008 if ( stopIndexI == nil ) { 1009 return ANTLR_MEMO_RULE_UNKNOWN; 1010 } 1011 return [stopIndexI integerValue]; 1012 } 1013 1014 /** Has this rule already parsed input at the current index in the 1015 * input stream? Return the stop token index or MEMO_RULE_UNKNOWN. 1016 * If we attempted but failed to parse properly before, return 1017 * MEMO_RULE_FAILED. 1018 * 1019 * This method has a side-effect: if we have seen this input for 1020 * this rule and successfully parsed before, then seek ahead to 1021 * 1 past the stop token matched for this rule last time. 1022 */ 1023 - (BOOL)alreadyParsedRule:(id<ANTLRIntStream>)anInput RuleIndex:(NSInteger)ruleIndex 1024 { 1025 NSInteger aStopIndex = [self getRuleMemoization:ruleIndex StartIndex:anInput.index]; 1026 if ( aStopIndex == ANTLR_MEMO_RULE_UNKNOWN ) { 1027 // NSLog(@"rule %d not yet encountered\n", ruleIndex); 1028 return NO; 1029 } 1030 if ( aStopIndex == ANTLR_MEMO_RULE_FAILED ) { 1031 if (debug) NSLog(@"rule %d will never succeed\n", ruleIndex); 1032 state.failed = YES; 1033 } 1034 else { 1035 if (debug) NSLog(@"seen rule %d before; skipping ahead to %d failed = %@\n", ruleIndex, aStopIndex+1, state.failed?@"YES":@"NO"); 1036 [anInput seek:(aStopIndex+1)]; // jump to one past stop token 1037 } 1038 return YES; 1039 } 1040 1041 /** Record whether or not this rule parsed the input at this position 1042 * successfully. Use a standard java hashtable for now. 1043 */ 1044 - (void)memoize:(id<ANTLRIntStream>)anInput 1045 RuleIndex:(NSInteger)ruleIndex 1046 StartIndex:(NSInteger)ruleStartIndex 1047 { 1048 ANTLRRuleStack *aRuleStack; 1049 NSInteger stopTokenIndex; 1050 1051 aRuleStack = state.ruleMemo; 1052 stopTokenIndex = (state.failed ? ANTLR_MEMO_RULE_FAILED : (anInput.index-1)); 1053 if ( aRuleStack == nil ) { 1054 if (debug) NSLog(@"!!!!!!!!! memo array is nil for %@", [self getGrammarFileName]); 1055 return; 1056 } 1057 if ( ruleIndex >= [aRuleStack length] ) { 1058 if (debug) NSLog(@"!!!!!!!!! memo size is %d, but rule index is %d", [state.ruleMemo length], ruleIndex); 1059 return; 1060 } 1061 if ( [aRuleStack objectAtIndex:ruleIndex] != nil ) { 1062 [aRuleStack putHashRuleAtRuleIndex:ruleIndex StartIndex:ruleStartIndex StopIndex:stopTokenIndex]; 1063 } 1064 return; 1065 } 1066 1067 /** return how many rule/input-index pairs there are in total. 1068 * TODO: this includes synpreds. :( 1069 */ 1070 - (NSInteger)getRuleMemoizationCacheSize 1071 { 1072 ANTLRRuleStack *aRuleStack; 1073 ANTLRHashRule *aHashRule; 1074 1075 int aCnt = 0; 1076 aRuleStack = state.ruleMemo; 1077 for (NSUInteger i = 0; aRuleStack != nil && i < [aRuleStack length]; i++) { 1078 aHashRule = [aRuleStack objectAtIndex:i]; 1079 if ( aHashRule != nil ) { 1080 aCnt += [aHashRule count]; // how many input indexes are recorded? 1081 } 1082 } 1083 return aCnt; 1084 } 1085 1086 #pragma warning Have to fix traceIn and traceOut. 1087 - (void)traceIn:(NSString *)ruleName Index:(NSInteger)ruleIndex Object:(id)inputSymbol 1088 { 1089 NSLog(@"enter %@ %@", ruleName, inputSymbol); 1090 if ( state.backtracking > 0 ) { 1091 NSLog(@" backtracking=%s", ((state.backtracking==YES)?"YES":"NO")); 1092 } 1093 NSLog(@"\n"); 1094 } 1095 1096 - (void)traceOut:(NSString *)ruleName Index:(NSInteger)ruleIndex Object:(id)inputSymbol 1097 { 1098 NSLog(@"exit %@ -- %@", ruleName, inputSymbol); 1099 if ( state.backtracking > 0 ) { 1100 NSLog(@" backtracking=%s %s", state.backtracking?"YES":"NO", state.failed ? "failed":"succeeded"); 1101 } 1102 NSLog(@"\n"); 1103 } 1104 1105 1106 // call a syntactic predicate methods using its selector. this way we can support arbitrary synpreds. 1107 - (BOOL) evaluateSyntacticPredicate:(SEL)synpredFragment // stream:(id<ANTLRIntStream>)input 1108 { 1109 id<ANTLRIntStream> input; 1110 1111 state.backtracking++; 1112 // input = state.token.input; 1113 input = self.input; 1114 int start = [input mark]; 1115 @try { 1116 [self performSelector:synpredFragment]; 1117 } 1118 @catch (ANTLRRecognitionException *re) { 1119 NSLog(@"impossible synpred: %@", re.name); 1120 } 1121 BOOL success = (state.failed == NO); 1122 [input rewind:start]; 1123 state.backtracking--; 1124 state.failed = NO; 1125 return success; 1126 } 1127 1128 @end 1129 1130