1 # ----------------------------------------------------------------------------- 2 # ply: lex.py 3 # 4 # Copyright (C) 2001-2011, 5 # David M. Beazley (Dabeaz LLC) 6 # All rights reserved. 7 # 8 # Redistribution and use in source and binary forms, with or without 9 # modification, are permitted provided that the following conditions are 10 # met: 11 # 12 # * Redistributions of source code must retain the above copyright notice, 13 # this list of conditions and the following disclaimer. 14 # * Redistributions in binary form must reproduce the above copyright notice, 15 # this list of conditions and the following disclaimer in the documentation 16 # and/or other materials provided with the distribution. 17 # * Neither the name of the David Beazley or Dabeaz LLC may be used to 18 # endorse or promote products derived from this software without 19 # specific prior written permission. 20 # 21 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 # ----------------------------------------------------------------------------- 33 34 __version__ = "3.4" 35 __tabversion__ = "3.2" # Version of table file used 36 37 import re, sys, types, copy, os 38 39 # This tuple contains known string types 40 try: 41 # Python 2.6 42 StringTypes = (types.StringType, types.UnicodeType) 43 except AttributeError: 44 # Python 3.0 45 StringTypes = (str, bytes) 46 47 # Extract the code attribute of a function. Different implementations 48 # are for Python 2/3 compatibility. 49 50 if sys.version_info[0] < 3: 51 def func_code(f): 52 return f.func_code 53 else: 54 def func_code(f): 55 return f.__code__ 56 57 # This regular expression is used to match valid token names 58 _is_identifier = re.compile(r'^[a-zA-Z0-9_]+$') 59 60 # Exception thrown when invalid token encountered and no default error 61 # handler is defined. 62 63 class LexError(Exception): 64 def __init__(self,message,s): 65 self.args = (message,) 66 self.text = s 67 68 # Token class. This class is used to represent the tokens produced. 69 class LexToken(object): 70 def __str__(self): 71 return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos) 72 def __repr__(self): 73 return str(self) 74 75 # This object is a stand-in for a logging object created by the 76 # logging module. 77 78 class PlyLogger(object): 79 def __init__(self,f): 80 self.f = f 81 def critical(self,msg,*args,**kwargs): 82 self.f.write((msg % args) + "\n") 83 84 def warning(self,msg,*args,**kwargs): 85 self.f.write("WARNING: "+ (msg % args) + "\n") 86 87 def error(self,msg,*args,**kwargs): 88 self.f.write("ERROR: " + (msg % args) + "\n") 89 90 info = critical 91 debug = critical 92 93 # Null logger is used when no output is generated. Does nothing. 
class NullLogger(object):
    """A do-nothing logger: every attribute access and every call returns
    the NullLogger itself, so chained logging usage is a silent no-op."""
    def __getattribute__(self,name):
        return self
    def __call__(self,*args,**kwargs):
        return self

# -----------------------------------------------------------------------------
# === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime. There are only
# a few public methods and attributes:
#
#    input()   - Store a new string in the lexer
#    token()   - Get the next token
#    clone()   - Clone the lexer
#
#    lineno    - Current line number
#    lexpos    - Current position in the input string
# -----------------------------------------------------------------------------

class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re,findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexoptimize = 0          # Optimized mode

    def clone(self,object=None):
        """Return a shallow copy of this lexer.

        If *object* is supplied, the lexer is being attached to a new
        object, so every rule function in the lexstatere tables and every
        error function in lexstateerrorf is rebound (by name) to the
        corresponding method of *object*.
        """
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object. In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = { }
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        # Entries with no function (plain string rules or
                        # unused group slots) are kept as-is.
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object,f[0].__name__),f[1]))
                    newre.append((cre,newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = { }
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object,ef.__name__)
            c.lexmodule = object
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self,tabfile,outputdir=""):
        """Write the lexer's tables to <tabfile>.py in *outputdir* so they
        can be reloaded later with readtab(). No-op when *tabfile* is
        already a module object."""
        if isinstance(tabfile,types.ModuleType):
            return
        # Only the last dotted component becomes the file name.
        basetabfilename = tabfile.split(".")[-1]
        filename = os.path.join(outputdir,basetabfilename)+".py"
        tf = open(filename,"w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
        # NOTE(review): the table is stamped with __version__, not
        # __tabversion__; readtab() compares against __version__ as well,
        # so the two stay consistent.
        tf.write("_tabversion = %s\n" % repr(__version__))
        tf.write("_lextokens = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = { }
        # Collect all functions in the initial state
        initial = self.lexstatere["INITIAL"]
        initialfuncs = []
        for part in initial:
            for f in part[1]:
                if f and f[0]:
                    initialfuncs.append(f)

        # Serialize each state's regexes with functions replaced by names.
        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1],self.lexstaterenames[key][i])))
            tabre[key] = titem

        tf.write("_lexstatere = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = { }
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self,tabfile,fdict):
        """Load lexer tables previously written by writetab().

        *fdict* maps rule-function names back to callables. Raises
        ImportError when the table was produced by a different PLY
        version (or cannot be imported).
        """
        if isinstance(tabfile,types.ModuleType):
            lextab = tabfile
        else:
            # Import the table module by name; on Python 3 the exec needs
            # an explicit namespace to retrieve the bound name from.
            if sys.version_info[0] < 3:
                exec("import %s as lextab" % tabfile)
            else:
                env = { }
                exec("import %s as lextab" % tabfile, env,env)
                lextab = env['lextab']

        if getattr(lextab,"_tabversion","0.0") != __version__:
            raise ImportError("Inconsistent PLY version")

        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = { }
        self.lexstateretext = { }
        for key,lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                # Recompile each master regex and rebind rule names to
                # actual functions via fdict.
                titem.append((re.compile(lre[i][0],lextab._lexreflags | re.VERBOSE),_names_to_funcs(lre[i][1],fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = { }
        for key,ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self,s):
        """Store a new input string and reset the scanning position."""
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c,StringTypes):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self,state):
        """Switch the lexer to *state*, activating its regexes, ignore
        set and error function. Raises ValueError for unknown states."""
        if not state in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state,"")
        self.lexerrorf = self.lexstateerrorf.get(state,None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self,state):
        """Enter *state*, remembering the current state on a stack."""
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        """Return to the most recently pushed state."""
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        """Return the name of the current lexing state."""
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self,n):
        """Advance the scanning position by *n* characters."""
        self.lexpos += n

    # ------------------------------------------------------------
    # opttoken() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible. Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        """Return the next LexToken, or None when the input is exhausted.

        Raises LexError on an illegal character (when no error rule is
        defined, or the error rule fails to advance) and RuntimeError if
        token() is called before input().
        """
        # Make local copies of frequently referenced attributes
        lexpos    = self.lexpos
        lexlen    = self.lexlen
        lexignore = self.lexignore
        lexdata   = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre,lexindexfunc in self.lexre:
                m = lexre.match(lexdata,lexpos)
                if not m: continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                # lastindex identifies which named rule group matched.
                i = m.lastindex
                func,tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it

                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token, if nothing, we just move to next token
                if not newtok:
                    lexpos    = self.lexpos         # This is here in case user has updated lexpos.
                    lexignore = self.lexignore      # This is here in case there was a state change
                    break

                # Verify type of the token. If not in the token map, raise an error
                if not self.lexoptimize:
                    if not newtok.type in self.lextokens:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func_code(func).co_filename, func_code(func).co_firstlineno,
                            func.__name__, newtok.type),lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok: continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])

        # End of input: record the position one past the end.
        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError("No input string given with input()")
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        """Python 2 iterator protocol: return the next token or stop."""
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next
425 # ----------------------------------------------------------------------------- 426 427 def get_caller_module_dict(levels): 428 try: 429 raise RuntimeError 430 except RuntimeError: 431 e,b,t = sys.exc_info() 432 f = t.tb_frame 433 while levels > 0: 434 f = f.f_back 435 levels -= 1 436 ldict = f.f_globals.copy() 437 if f.f_globals != f.f_locals: 438 ldict.update(f.f_locals) 439 440 return ldict 441 442 # ----------------------------------------------------------------------------- 443 # _funcs_to_names() 444 # 445 # Given a list of regular expression functions, this converts it to a list 446 # suitable for output to a table file 447 # ----------------------------------------------------------------------------- 448 449 def _funcs_to_names(funclist,namelist): 450 result = [] 451 for f,name in zip(funclist,namelist): 452 if f and f[0]: 453 result.append((name, f[1])) 454 else: 455 result.append(f) 456 return result 457 458 # ----------------------------------------------------------------------------- 459 # _names_to_funcs() 460 # 461 # Given a list of regular expression function names, this converts it back to 462 # functions. 463 # ----------------------------------------------------------------------------- 464 465 def _names_to_funcs(namelist,fdict): 466 result = [] 467 for n in namelist: 468 if n and n[0]: 469 result.append((fdict[n[0]],n[1])) 470 else: 471 result.append(n) 472 return result 473 474 # ----------------------------------------------------------------------------- 475 # _form_master_re() 476 # 477 # This function takes a list of all of the regex components and attempts to 478 # form the master regular expression. Given limitations in the Python re 479 # module, it may be necessary to break the master regex into separate expressions. 
480 # ----------------------------------------------------------------------------- 481 482 def _form_master_re(relist,reflags,ldict,toknames): 483 if not relist: return [] 484 regex = "|".join(relist) 485 try: 486 lexre = re.compile(regex,re.VERBOSE | reflags) 487 488 # Build the index to function map for the matching engine 489 lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1) 490 lexindexnames = lexindexfunc[:] 491 492 for f,i in lexre.groupindex.items(): 493 handle = ldict.get(f,None) 494 if type(handle) in (types.FunctionType, types.MethodType): 495 lexindexfunc[i] = (handle,toknames[f]) 496 lexindexnames[i] = f 497 elif handle is not None: 498 lexindexnames[i] = f 499 if f.find("ignore_") > 0: 500 lexindexfunc[i] = (None,None) 501 else: 502 lexindexfunc[i] = (None, toknames[f]) 503 504 return [(lexre,lexindexfunc)],[regex],[lexindexnames] 505 except Exception: 506 m = int(len(relist)/2) 507 if m == 0: m = 1 508 llist, lre, lnames = _form_master_re(relist[:m],reflags,ldict,toknames) 509 rlist, rre, rnames = _form_master_re(relist[m:],reflags,ldict,toknames) 510 return llist+rlist, lre+rre, lnames+rnames 511 512 # ----------------------------------------------------------------------------- 513 # def _statetoken(s,names) 514 # 515 # Given a declaration name s of the form "t_" and a dictionary whose keys are 516 # state names, this function returns a tuple (states,tokenname) where states 517 # is a tuple of state names and tokenname is the name of the token. 
# For example, calling this with s = "t_foo_bar_SPAM" might return
# (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------

def _statetoken(s,names):
    """Split a rule name into its (states, tokenname) parts as described
    in the banner above."""
    nonstate = 1          # note: unused; kept as-is
    parts = s.split("_")
    # Find the first part that is neither a known state nor 'ANY'.
    for i in range(1,len(parts)):
        if not parts[i] in names and parts[i] != 'ANY': break
    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names)

    tokenname = "_".join(parts[i:])
    return (states,tokenname)


# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    """Collects and validates lexing rules found in a symbol dictionary
    (typically a module's namespace) prior to building a Lexer."""

    def __init__(self,ldict,log=None,reflags=0):
        self.ldict      = ldict
        self.error_func = None
        self.tokens     = []
        self.reflags    = reflags
        self.stateinfo  = { 'INITIAL' : 'inclusive'}
        self.files      = {}           # files containing rules, for duplicate checks
        self.error      = 0            # set to 1 when any validation error occurs

        if log is None:
            self.log = PlyLogger(sys.stderr)
        else:
            self.log = log

    # Get all of the basic information
    def get_all(self):
        """Extract tokens, literals, states and rules from ldict."""
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        """Run all validation passes; return nonzero on any error."""
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        """Read the 'tokens' list from ldict into self.tokens."""
        tokens = self.ldict.get("tokens",None)
        if not tokens:
            self.log.error("No token list is defined")
            self.error = 1
            return

        if not isinstance(tokens,(list, tuple)):
            self.log.error("tokens must be a list or tuple")
            self.error = 1
            return

        # (Redundant with the first check above; kept as-is.)
        if not tokens:
            self.log.error("tokens is empty")
            self.error = 1
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        """Check token names are identifiers and warn on duplicates."""
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'",n)
                self.error = 1
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        """Read the 'literals' specification (string or sequence)."""
        self.literals = self.ldict.get("literals","")

    # Validate literals
    def validate_literals(self):
        """Every literal must be a single-character string."""
        try:
            for c in self.literals:
                if not isinstance(c,StringTypes) or len(c) > 1:
                    self.log.error("Invalid literal %s. Must be a single character", repr(c))
                    self.error = 1
                    continue

        except TypeError:
            self.log.error("Invalid literals specification. literals must be a sequence of characters")
            self.error = 1

    def get_states(self):
        """Read the optional 'states' declaration into self.stateinfo."""
        self.states = self.ldict.get("states",None)
        # Build statemap
        if self.states:
            if not isinstance(self.states,(tuple,list)):
                self.log.error("states must be defined as a tuple or list")
                self.error = 1
            else:
                for s in self.states:
                    if not isinstance(s,tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')",repr(s))
                        self.error = 1
                        continue
                    name, statetype = s
                    if not isinstance(name,StringTypes):
                        self.log.error("State name %s must be a string", repr(name))
                        self.error = 1
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'",name)
                        self.error = 1
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined",name)
                        self.error = 1
                        continue
                    self.stateinfo[name] = statetype

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        """Partition every t_* symbol into per-state function rules,
        string rules, ignore sets and error handlers."""
        tsymbols = [f for f in self.ldict if f[:2] == 't_' ]

        # Now build up a list of functions and a list of strings

        self.toknames = { }        # Mapping of symbols to token names
        self.funcsym =  { }        # Symbols defined as functions
        self.strsym =   { }        # Symbols defined as strings
        self.ignore   = { }        # Ignore strings by state
        self.errorf   = { }        # Error functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error("No rules of the form t_rulename are defined")
            self.error = 1
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f,self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t,"__call__"):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'ignore':
                    # t_ignore must be a string, not a function.
                    line = func_code(t).co_firstlineno
                    file = func_code(t).co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string",file,line,t.__name__)
                    self.error = 1
                else:
                    for s in states:
                        self.funcsym[s].append((f,t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if "\\" in t:
                        self.log.warning("%s contains a literal backslash '\\'",f)

                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = 1
                else:
                    for s in states:
                        self.strsym[s].append((f,t))
            else:
                self.log.error("%s not defined as a function or string", f)
                self.error = 1

        # Sort the functions by line number
        for f in self.funcsym.values():
            if sys.version_info[0] < 3:
                f.sort(lambda x,y: cmp(func_code(x[1]).co_firstlineno,func_code(y[1]).co_firstlineno))
            else:
                # Python 3.0
                f.sort(key=lambda x: func_code(x[1]).co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            if sys.version_info[0] < 3:
                s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
            else:
                # Python 3.0
                s.sort(key=lambda x: len(x[1]),reverse=True)

    # Validate all of the t_rules collected
    def validate_rules(self):
        """Check every collected rule: argument counts, docstring regexes,
        empty-string matches, error-rule signatures, and duplicate
        definitions in the source files."""
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                tokname = self.toknames[fname]
                # Bound methods carry an implicit self argument.
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
                    self.error = 1
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
                    self.error = 1
                    continue

                if not f.__doc__:
                    self.log.error("%s:%d: No regular expression defined for rule '%s'",file,line,f.__name__)
                    self.error = 1
                    continue

                try:
                    # Compile the docstring regex exactly as the lexer will.
                    c = re.compile("(?P<%s>%s)" % (fname,f.__doc__), re.VERBOSE | self.reflags)
                    if c.match(""):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file,line,f.__name__)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file,line,f.__name__,e)
                    if '#' in f.__doc__:
                        self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'",file,line, f.__name__)
                    self.error = 1

            # Validate all rules defined by strings
            for name,r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = 1
                    continue

                if not tokname in self.tokens and tokname.find("ignore_") < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s",name,tokname)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | self.reflags)
                    if (c.match("")):
                        self.log.error("Regular expression for rule '%s' matches empty string",name)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("Invalid regular expression for rule '%s'. %s",name,e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'",name)
                    self.error = 1

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'",state)
                self.error = 1

            # Validate the error function
            efunc = self.errorf.get(state,None)
            if efunc:
                f = efunc
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
                    self.error = 1

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
                    self.error = 1

        for f in self.files:
            self.validate_file(f)


    # -----------------------------------------------------------------------------
    # validate_file()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the parser input file. This is done using a simple regular expression
    # match on each line in the given file.
    # -----------------------------------------------------------------------------

    def validate_file(self,filename):
        """Scan *filename* for duplicate t_* rule definitions."""
        import os.path
        base,ext = os.path.splitext(filename)
        if ext != '.py': return         # No idea what the file is. Return OK

        try:
            f = open(filename)
            lines = f.readlines()
            f.close()
        except IOError:
            return                      # Couldn't find the file. Don't worry about it

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = { }
        linen = 1
        for l in lines:
            m = fre.match(l)
            if not m:
                m = sre.match(l)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    self.log.error("%s:%d: Rule %s redefined. Previously defined on line %d",filename,linen,name,prev)
                    self.error = 1
            linen += 1

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0,outputdir="", debuglog=None, errorlog=None):
    """Build and return a Lexer from rule definitions.

    Rules come from *object* (an instance), *module*, or — when both are
    None — the caller's namespace. With optimize=1 the tables are read
    from / written to *lextab* instead of being revalidated. Also binds
    the module-level token(), input() and lexer globals.
    """
    global lexer
    ldict = None
    stateinfo  = { 'INITIAL' : 'inclusive'}
    lexobj = Lexer()
    lexobj.lexoptimize = optimize
    global token,input

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object: module = object

    if module:
        _items = [(k,getattr(module,k)) for k in dir(module)]
        ldict = dict(_items)
    else:
        ldict = get_caller_module_dict(2)

    # Collect parser information from the dictionary
    linfo = LexerReflect(ldict,log=errorlog,reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab,ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            # Fall through and rebuild the tables from scratch.
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info("lex: tokens = %r", linfo.tokens)
        debuglog.info("lex: literals = %r", linfo.literals)
        debuglog.info("lex: states = %r", linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = { }
    for n in linfo.tokens:
        lexobj.lextokens[n] = 1

    # Get literals specification
    if isinstance(linfo.literals,(list,tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    regexs = { }
    # Build the master regular expressions
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            line = func_code(f).co_firstlineno
            file = func_code(f).co_filename
            regex_list.append("(?P<%s>%s)" % (fname,f.__doc__))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",fname,f.__doc__, state)

        # Now add all of the simple rules
        for name,r in linfo.strsym[state]:
            regex_list.append("(?P<%s>%s)" % (name,r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",name,r, state)

        regexs[state] = regex_list

    # Build the master regular expressions

    if debug:
        debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state],reflags,ldict,linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i in range(len(re_text)):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'",state, i, re_text[i])

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state,stype in stateinfo.items():
        if state != "INITIAL" and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get("INITIAL",None)
    if not lexobj.lexerrorf:
        errorlog.warning("No t_error rule is defined")

    # Check state information for ignore and error rules
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
            if not s in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if not s in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            # Inclusive states inherit INITIAL's error and ignore rules.
            if not s in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get("INITIAL",None)
            if not s in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get("INITIAL","")

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab,outputdir)

    return lexobj

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    """Tokenize *data* (or a file named on the command line, or stdin)
    and print every token. Uses the module-level lexer when none given."""
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            sys.stdout.write("Reading from standard input (type EOF to end):\n")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,tok.lexpos))

# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regex expression on a function
# when its docstring might need to be set in an alternative way
# -----------------------------------------------------------------------------

def TOKEN(r):
    """Decorator: set the decorated rule's docstring (its regex) to *r*,
    or to r.__doc__ when *r* is itself callable."""
    def set_doc(f):
        if hasattr(r,"__call__"):
            f.__doc__ = r.__doc__
        else:
            f.__doc__ = r
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN