1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. 2 # Licensed to PSF under a Contributor Agreement. 3 4 """This module defines the data structures used to represent a grammar. 5 6 These are a bit arcane because they are derived from the data 7 structures used by Python's 'pgen' parser generator. 8 9 There's also a table here mapping operators to their names in the 10 token module; the Python tokenize module reports all operators as the 11 fallback token code OP, but the parser needs the actual token code. 12 13 """ 14 15 # Python imports 16 import collections 17 import pickle 18 19 # Local imports 20 from . import token, tokenize 21 22 23 class Grammar(object): 24 """Pgen parsing tables conversion class. 25 26 Once initialized, this class supplies the grammar tables for the 27 parsing engine implemented by parse.py. The parsing engine 28 accesses the instance variables directly. The class here does not 29 provide initialization of the tables; several subclasses exist to 30 do this (see the conv and pgen modules). 31 32 The load() method reads the tables from a pickle file, which is 33 much faster than the other ways offered by subclasses. The pickle 34 file is written by calling dump() (after loading the grammar 35 tables using a subclass). The report() method prints a readable 36 representation of the tables to stdout, for debugging. 37 38 The instance variables are as follows: 39 40 symbol2number -- a dict mapping symbol names to numbers. Symbol 41 numbers are always 256 or higher, to distinguish 42 them from token numbers, which are between 0 and 43 255 (inclusive). 44 45 number2symbol -- a dict mapping numbers to symbol names; 46 these two are each other's inverse. 47 48 states -- a list of DFAs, where each DFA is a list of 49 states, each state is a list of arcs, and each 50 arc is a (i, j) pair where i is a label and j is 51 a state number. The DFA number is the index into 52 this list. (This name is slightly confusing.) 53 Final states are represented by a special arc of 54 the form (0, j) where j is its own state number. 55 56 dfas -- a dict mapping symbol numbers to (DFA, first) 57 pairs, where DFA is an item from the states list 58 above, and first is a set of tokens that can 59 begin this grammar rule (represented by a dict 60 whose values are always 1). 61 62 labels -- a list of (x, y) pairs where x is either a token 63 number or a symbol number, and y is either None 64 or a string; the strings are keywords. The label 65 number is the index in this list; label numbers 66 are used to mark state transitions (arcs) in the 67 DFAs. 68 69 start -- the number of the grammar's start symbol. 70 71 keywords -- a dict mapping keyword strings to arc labels. 72 73 tokens -- a dict mapping token numbers to arc labels. 74 75 """ 76 77 def __init__(self): 78 self.symbol2number = {} 79 self.number2symbol = {} 80 self.states = [] 81 self.dfas = {} 82 self.labels = [(0, "EMPTY")] 83 self.keywords = {} 84 self.tokens = {} 85 self.symbol2label = {} 86 self.start = 256 87 88 def dump(self, filename): 89 """Dump the grammar tables to a pickle file. 90 91 dump() recursively changes all dict to OrderedDict, so the pickled file 92 is not exactly the same as what was passed in to dump(). load() uses the 93 pickled file to create the tables, but only changes OrderedDict to dict 94 at the top level; it does not recursively change OrderedDict to dict. 95 So, the loaded tables are different from the original tables that were 96 passed to load() in that some of the OrderedDict (from the pickled file) 97 are not changed back to dict. For parsing, this has no effect on 98 performance because OrderedDict uses dict's __getitem__ with nothing in 99 between. 100 """ 101 with open(filename, "wb") as f: 102 d = _make_deterministic(self.__dict__) 103 pickle.dump(d, f, 2) 104 105 def load(self, filename): 106 """Load the grammar tables from a pickle file.""" 107 f = open(filename, "rb") 108 d = pickle.load(f) 109 f.close() 110 self.__dict__.update(d) 111 112 def copy(self): 113 """ 114 Copy the grammar. 115 """ 116 new = self.__class__() 117 for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords", 118 "tokens", "symbol2label"): 119 setattr(new, dict_attr, getattr(self, dict_attr).copy()) 120 new.labels = self.labels[:] 121 new.states = self.states[:] 122 new.start = self.start 123 return new 124 125 def report(self): 126 """Dump the grammar tables to standard output, for debugging.""" 127 from pprint import pprint 128 print "s2n" 129 pprint(self.symbol2number) 130 print "n2s" 131 pprint(self.number2symbol) 132 print "states" 133 pprint(self.states) 134 print "dfas" 135 pprint(self.dfas) 136 print "labels" 137 pprint(self.labels) 138 print "start", self.start 139 140 141 def _make_deterministic(top): 142 if isinstance(top, dict): 143 return collections.OrderedDict( 144 sorted(((k, _make_deterministic(v)) for k, v in top.iteritems()))) 145 if isinstance(top, list): 146 return [_make_deterministic(e) for e in top] 147 if isinstance(top, tuple): 148 return tuple(_make_deterministic(e) for e in top) 149 return top 150 151 152 # Map from operator to number (since tokenize doesn't do this) 153 154 opmap_raw = """ 155 ( LPAR 156 ) RPAR 157 [ LSQB 158 ] RSQB 159 : COLON 160 , COMMA 161 ; SEMI 162 + PLUS 163 - MINUS 164 * STAR 165 / SLASH 166 | VBAR 167 & AMPER 168 < LESS 169 > GREATER 170 = EQUAL 171 . DOT 172 % PERCENT 173 ` BACKQUOTE 174 { LBRACE 175 } RBRACE 176 @ AT 177 @= ATEQUAL 178 == EQEQUAL 179 != NOTEQUAL 180 <> NOTEQUAL 181 <= LESSEQUAL 182 >= GREATEREQUAL 183 ~ TILDE 184 ^ CIRCUMFLEX 185 << LEFTSHIFT 186 >> RIGHTSHIFT 187 ** DOUBLESTAR 188 += PLUSEQUAL 189 -= MINEQUAL 190 *= STAREQUAL 191 /= SLASHEQUAL 192 %= PERCENTEQUAL 193 &= AMPEREQUAL 194 |= VBAREQUAL 195 ^= CIRCUMFLEXEQUAL 196 <<= LEFTSHIFTEQUAL 197 >>= RIGHTSHIFTEQUAL 198 **= DOUBLESTAREQUAL 199 // DOUBLESLASH 200 //= DOUBLESLASHEQUAL 201 -> RARROW 202 """ 203 204 opmap = {} 205 for line in opmap_raw.splitlines(): 206 if line: 207 op, name = line.split() 208 opmap[op] = getattr(token, name) 209