1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. 2 # Licensed to PSF under a Contributor Agreement. 3 4 # Pgen imports 5 from . import grammar, token, tokenize 6 7 class PgenGrammar(grammar.Grammar): 8 pass 9 10 class ParserGenerator(object): 11 12 def __init__(self, filename, stream=None): 13 close_stream = None 14 if stream is None: 15 stream = open(filename) 16 close_stream = stream.close 17 self.filename = filename 18 self.stream = stream 19 self.generator = tokenize.generate_tokens(stream.readline) 20 self.gettoken() # Initialize lookahead 21 self.dfas, self.startsymbol = self.parse() 22 if close_stream is not None: 23 close_stream() 24 self.first = {} # map from symbol name to set of tokens 25 self.addfirstsets() 26 27 def make_grammar(self): 28 c = PgenGrammar() 29 names = self.dfas.keys() 30 names.sort() 31 names.remove(self.startsymbol) 32 names.insert(0, self.startsymbol) 33 for name in names: 34 i = 256 + len(c.symbol2number) 35 c.symbol2number[name] = i 36 c.number2symbol[i] = name 37 for name in names: 38 dfa = self.dfas[name] 39 states = [] 40 for state in dfa: 41 arcs = [] 42 for label, next in state.arcs.iteritems(): 43 arcs.append((self.make_label(c, label), dfa.index(next))) 44 if state.isfinal: 45 arcs.append((0, dfa.index(state))) 46 states.append(arcs) 47 c.states.append(states) 48 c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name)) 49 c.start = c.symbol2number[self.startsymbol] 50 return c 51 52 def make_first(self, c, name): 53 rawfirst = self.first[name] 54 first = {} 55 for label in rawfirst: 56 ilabel = self.make_label(c, label) 57 ##assert ilabel not in first # XXX failed on <> ... != 58 first[ilabel] = 1 59 return first 60 61 def make_label(self, c, label): 62 # XXX Maybe this should be a method on a subclass of converter? 63 ilabel = len(c.labels) 64 if label[0].isalpha(): 65 # Either a symbol name or a named token 66 if label in c.symbol2number: 67 # A symbol name (a non-terminal) 68 if label in c.symbol2label: 69 return c.symbol2label[label] 70 else: 71 c.labels.append((c.symbol2number[label], None)) 72 c.symbol2label[label] = ilabel 73 return ilabel 74 else: 75 # A named token (NAME, NUMBER, STRING) 76 itoken = getattr(token, label, None) 77 assert isinstance(itoken, int), label 78 assert itoken in token.tok_name, label 79 if itoken in c.tokens: 80 return c.tokens[itoken] 81 else: 82 c.labels.append((itoken, None)) 83 c.tokens[itoken] = ilabel 84 return ilabel 85 else: 86 # Either a keyword or an operator 87 assert label[0] in ('"', "'"), label 88 value = eval(label) 89 if value[0].isalpha(): 90 # A keyword 91 if value in c.keywords: 92 return c.keywords[value] 93 else: 94 c.labels.append((token.NAME, value)) 95 c.keywords[value] = ilabel 96 return ilabel 97 else: 98 # An operator (any non-numeric token) 99 itoken = grammar.opmap[value] # Fails if unknown token 100 if itoken in c.tokens: 101 return c.tokens[itoken] 102 else: 103 c.labels.append((itoken, None)) 104 c.tokens[itoken] = ilabel 105 return ilabel 106 107 def addfirstsets(self): 108 names = self.dfas.keys() 109 names.sort() 110 for name in names: 111 if name not in self.first: 112 self.calcfirst(name) 113 #print name, self.first[name].keys() 114 115 def calcfirst(self, name): 116 dfa = self.dfas[name] 117 self.first[name] = None # dummy to detect left recursion 118 state = dfa[0] 119 totalset = {} 120 overlapcheck = {} 121 for label, next in state.arcs.iteritems(): 122 if label in self.dfas: 123 if label in self.first: 124 fset = self.first[label] 125 if fset is None: 126 raise ValueError("recursion for rule %r" % name) 127 else: 128 self.calcfirst(label) 129 fset = self.first[label] 130 totalset.update(fset) 131 overlapcheck[label] = fset 132 else: 133 totalset[label] = 1 134 overlapcheck[label] = {label: 1} 135 inverse = {} 136 for label, itsfirst in overlapcheck.iteritems(): 137 for symbol in itsfirst: 138 if symbol in inverse: 139 raise ValueError("rule %s is ambiguous; %s is in the" 140 " first sets of %s as well as %s" % 141 (name, symbol, label, inverse[symbol])) 142 inverse[symbol] = label 143 self.first[name] = totalset 144 145 def parse(self): 146 dfas = {} 147 startsymbol = None 148 # MSTART: (NEWLINE | RULE)* ENDMARKER 149 while self.type != token.ENDMARKER: 150 while self.type == token.NEWLINE: 151 self.gettoken() 152 # RULE: NAME ':' RHS NEWLINE 153 name = self.expect(token.NAME) 154 self.expect(token.OP, ":") 155 a, z = self.parse_rhs() 156 self.expect(token.NEWLINE) 157 #self.dump_nfa(name, a, z) 158 dfa = self.make_dfa(a, z) 159 #self.dump_dfa(name, dfa) 160 oldlen = len(dfa) 161 self.simplify_dfa(dfa) 162 newlen = len(dfa) 163 dfas[name] = dfa 164 #print name, oldlen, newlen 165 if startsymbol is None: 166 startsymbol = name 167 return dfas, startsymbol 168 169 def make_dfa(self, start, finish): 170 # To turn an NFA into a DFA, we define the states of the DFA 171 # to correspond to *sets* of states of the NFA. Then do some 172 # state reduction. Let's represent sets as dicts with 1 for 173 # values. 174 assert isinstance(start, NFAState) 175 assert isinstance(finish, NFAState) 176 def closure(state): 177 base = {} 178 addclosure(state, base) 179 return base 180 def addclosure(state, base): 181 assert isinstance(state, NFAState) 182 if state in base: 183 return 184 base[state] = 1 185 for label, next in state.arcs: 186 if label is None: 187 addclosure(next, base) 188 states = [DFAState(closure(start), finish)] 189 for state in states: # NB states grows while we're iterating 190 arcs = {} 191 for nfastate in state.nfaset: 192 for label, next in nfastate.arcs: 193 if label is not None: 194 addclosure(next, arcs.setdefault(label, {})) 195 for label, nfaset in arcs.iteritems(): 196 for st in states: 197 if st.nfaset == nfaset: 198 break 199 else: 200 st = DFAState(nfaset, finish) 201 states.append(st) 202 state.addarc(st, label) 203 return states # List of DFAState instances; first one is start 204 205 def dump_nfa(self, name, start, finish): 206 print "Dump of NFA for", name 207 todo = [start] 208 for i, state in enumerate(todo): 209 print " State", i, state is finish and "(final)" or "" 210 for label, next in state.arcs: 211 if next in todo: 212 j = todo.index(next) 213 else: 214 j = len(todo) 215 todo.append(next) 216 if label is None: 217 print " -> %d" % j 218 else: 219 print " %s -> %d" % (label, j) 220 221 def dump_dfa(self, name, dfa): 222 print "Dump of DFA for", name 223 for i, state in enumerate(dfa): 224 print " State", i, state.isfinal and "(final)" or "" 225 for label, next in state.arcs.iteritems(): 226 print " %s -> %d" % (label, dfa.index(next)) 227 228 def simplify_dfa(self, dfa): 229 # This is not theoretically optimal, but works well enough. 230 # Algorithm: repeatedly look for two states that have the same 231 # set of arcs (same labels pointing to the same nodes) and 232 # unify them, until things stop changing. 233 234 # dfa is a list of DFAState instances 235 changes = True 236 while changes: 237 changes = False 238 for i, state_i in enumerate(dfa): 239 for j in range(i+1, len(dfa)): 240 state_j = dfa[j] 241 if state_i == state_j: 242 #print " unify", i, j 243 del dfa[j] 244 for state in dfa: 245 state.unifystate(state_j, state_i) 246 changes = True 247 break 248 249 def parse_rhs(self): 250 # RHS: ALT ('|' ALT)* 251 a, z = self.parse_alt() 252 if self.value != "|": 253 return a, z 254 else: 255 aa = NFAState() 256 zz = NFAState() 257 aa.addarc(a) 258 z.addarc(zz) 259 while self.value == "|": 260 self.gettoken() 261 a, z = self.parse_alt() 262 aa.addarc(a) 263 z.addarc(zz) 264 return aa, zz 265 266 def parse_alt(self): 267 # ALT: ITEM+ 268 a, b = self.parse_item() 269 while (self.value in ("(", "[") or 270 self.type in (token.NAME, token.STRING)): 271 c, d = self.parse_item() 272 b.addarc(c) 273 b = d 274 return a, b 275 276 def parse_item(self): 277 # ITEM: '[' RHS ']' | ATOM ['+' | '*'] 278 if self.value == "[": 279 self.gettoken() 280 a, z = self.parse_rhs() 281 self.expect(token.OP, "]") 282 a.addarc(z) 283 return a, z 284 else: 285 a, z = self.parse_atom() 286 value = self.value 287 if value not in ("+", "*"): 288 return a, z 289 self.gettoken() 290 z.addarc(a) 291 if value == "+": 292 return a, z 293 else: 294 return a, a 295 296 def parse_atom(self): 297 # ATOM: '(' RHS ')' | NAME | STRING 298 if self.value == "(": 299 self.gettoken() 300 a, z = self.parse_rhs() 301 self.expect(token.OP, ")") 302 return a, z 303 elif self.type in (token.NAME, token.STRING): 304 a = NFAState() 305 z = NFAState() 306 a.addarc(z, self.value) 307 self.gettoken() 308 return a, z 309 else: 310 self.raise_error("expected (...) or NAME or STRING, got %s/%s", 311 self.type, self.value) 312 313 def expect(self, type, value=None): 314 if self.type != type or (value is not None and self.value != value): 315 self.raise_error("expected %s/%s, got %s/%s", 316 type, value, self.type, self.value) 317 value = self.value 318 self.gettoken() 319 return value 320 321 def gettoken(self): 322 tup = self.generator.next() 323 while tup[0] in (tokenize.COMMENT, tokenize.NL): 324 tup = self.generator.next() 325 self.type, self.value, self.begin, self.end, self.line = tup 326 #print token.tok_name[self.type], repr(self.value) 327 328 def raise_error(self, msg, *args): 329 if args: 330 try: 331 msg = msg % args 332 except: 333 msg = " ".join([msg] + map(str, args)) 334 raise SyntaxError(msg, (self.filename, self.end[0], 335 self.end[1], self.line)) 336 337 class NFAState(object): 338 339 def __init__(self): 340 self.arcs = [] # list of (label, NFAState) pairs 341 342 def addarc(self, next, label=None): 343 assert label is None or isinstance(label, str) 344 assert isinstance(next, NFAState) 345 self.arcs.append((label, next)) 346 347 class DFAState(object): 348 349 def __init__(self, nfaset, final): 350 assert isinstance(nfaset, dict) 351 assert isinstance(iter(nfaset).next(), NFAState) 352 assert isinstance(final, NFAState) 353 self.nfaset = nfaset 354 self.isfinal = final in nfaset 355 self.arcs = {} # map from label to DFAState 356 357 def addarc(self, next, label): 358 assert isinstance(label, str) 359 assert label not in self.arcs 360 assert isinstance(next, DFAState) 361 self.arcs[label] = next 362 363 def unifystate(self, old, new): 364 for label, next in self.arcs.iteritems(): 365 if next is old: 366 self.arcs[label] = new 367 368 def __eq__(self, other): 369 # Equality test -- ignore the nfaset instance variable 370 assert isinstance(other, DFAState) 371 if self.isfinal != other.isfinal: 372 return False 373 # Can't just return self.arcs == other.arcs, because that 374 # would invoke this method recursively, with cycles... 375 if len(self.arcs) != len(other.arcs): 376 return False 377 for label, next in self.arcs.iteritems(): 378 if next is not other.arcs.get(label): 379 return False 380 return True 381 382 __hash__ = None # For Py3 compatibility. 383 384 def generate_grammar(filename="Grammar.txt"): 385 p = ParserGenerator(filename) 386 return p.make_grammar() 387