# pgen2/grammar.py (code-viewer navigation header removed)
      1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
      2 # Licensed to PSF under a Contributor Agreement.
      3 
      4 """This module defines the data structures used to represent a grammar.
      5 
      6 These are a bit arcane because they are derived from the data
      7 structures used by Python's 'pgen' parser generator.
      8 
      9 There's also a table here mapping operators to their names in the
     10 token module; the Python tokenize module reports all operators as the
     11 fallback token code OP, but the parser needs the actual token code.
     12 
     13 """
     14 
     15 # Python imports
     16 import collections
     17 import pickle
     18 
     19 # Local imports
     20 from . import token, tokenize
     21 
     22 
     23 class Grammar(object):
     24     """Pgen parsing tables conversion class.
     25 
     26     Once initialized, this class supplies the grammar tables for the
     27     parsing engine implemented by parse.py.  The parsing engine
     28     accesses the instance variables directly.  The class here does not
     29     provide initialization of the tables; several subclasses exist to
     30     do this (see the conv and pgen modules).
     31 
     32     The load() method reads the tables from a pickle file, which is
     33     much faster than the other ways offered by subclasses.  The pickle
     34     file is written by calling dump() (after loading the grammar
     35     tables using a subclass).  The report() method prints a readable
     36     representation of the tables to stdout, for debugging.
     37 
     38     The instance variables are as follows:
     39 
     40     symbol2number -- a dict mapping symbol names to numbers.  Symbol
     41                      numbers are always 256 or higher, to distinguish
     42                      them from token numbers, which are between 0 and
     43                      255 (inclusive).
     44 
     45     number2symbol -- a dict mapping numbers to symbol names;
     46                      these two are each other's inverse.
     47 
     48     states        -- a list of DFAs, where each DFA is a list of
     49                      states, each state is a list of arcs, and each
     50                      arc is a (i, j) pair where i is a label and j is
     51                      a state number.  The DFA number is the index into
     52                      this list.  (This name is slightly confusing.)
     53                      Final states are represented by a special arc of
     54                      the form (0, j) where j is its own state number.
     55 
     56     dfas          -- a dict mapping symbol numbers to (DFA, first)
     57                      pairs, where DFA is an item from the states list
     58                      above, and first is a set of tokens that can
     59                      begin this grammar rule (represented by a dict
     60                      whose values are always 1).
     61 
     62     labels        -- a list of (x, y) pairs where x is either a token
     63                      number or a symbol number, and y is either None
     64                      or a string; the strings are keywords.  The label
     65                      number is the index in this list; label numbers
     66                      are used to mark state transitions (arcs) in the
     67                      DFAs.
     68 
     69     start         -- the number of the grammar's start symbol.
     70 
     71     keywords      -- a dict mapping keyword strings to arc labels.
     72 
     73     tokens        -- a dict mapping token numbers to arc labels.
     74 
     75     """
     76 
     77     def __init__(self):
     78         self.symbol2number = {}
     79         self.number2symbol = {}
     80         self.states = []
     81         self.dfas = {}
     82         self.labels = [(0, "EMPTY")]
     83         self.keywords = {}
     84         self.tokens = {}
     85         self.symbol2label = {}
     86         self.start = 256
     87 
     88     def dump(self, filename):
     89         """Dump the grammar tables to a pickle file.
     90 
     91         dump() recursively changes all dict to OrderedDict, so the pickled file
     92         is not exactly the same as what was passed in to dump(). load() uses the
     93         pickled file to create the tables, but  only changes OrderedDict to dict
     94         at the top level; it does not recursively change OrderedDict to dict.
     95         So, the loaded tables are different from the original tables that were
     96         passed to load() in that some of the OrderedDict (from the pickled file)
     97         are not changed back to dict. For parsing, this has no effect on
     98         performance because OrderedDict uses dict's __getitem__ with nothing in
     99         between.
    100         """
    101         with open(filename, "wb") as f:
    102             d = _make_deterministic(self.__dict__)
    103             pickle.dump(d, f, 2)
    104 
    105     def load(self, filename):
    106         """Load the grammar tables from a pickle file."""
    107         f = open(filename, "rb")
    108         d = pickle.load(f)
    109         f.close()
    110         self.__dict__.update(d)
    111 
    112     def copy(self):
    113         """
    114         Copy the grammar.
    115         """
    116         new = self.__class__()
    117         for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
    118                           "tokens", "symbol2label"):
    119             setattr(new, dict_attr, getattr(self, dict_attr).copy())
    120         new.labels = self.labels[:]
    121         new.states = self.states[:]
    122         new.start = self.start
    123         return new
    124 
    125     def report(self):
    126         """Dump the grammar tables to standard output, for debugging."""
    127         from pprint import pprint
    128         print "s2n"
    129         pprint(self.symbol2number)
    130         print "n2s"
    131         pprint(self.number2symbol)
    132         print "states"
    133         pprint(self.states)
    134         print "dfas"
    135         pprint(self.dfas)
    136         print "labels"
    137         pprint(self.labels)
    138         print "start", self.start
    139 
    140 
    141 def _make_deterministic(top):
    142     if isinstance(top, dict):
    143         return collections.OrderedDict(
    144             sorted(((k, _make_deterministic(v)) for k, v in top.iteritems())))
    145     if isinstance(top, list):
    146         return [_make_deterministic(e) for e in top]
    147     if isinstance(top, tuple):
    148         return tuple(_make_deterministic(e) for e in top)
    149     return top
    150 
    151 
    152 # Map from operator to number (since tokenize doesn't do this)
    153 
    154 opmap_raw = """
    155 ( LPAR
    156 ) RPAR
    157 [ LSQB
    158 ] RSQB
    159 : COLON
    160 , COMMA
    161 ; SEMI
    162 + PLUS
    163 - MINUS
    164 * STAR
    165 / SLASH
    166 | VBAR
    167 & AMPER
    168 < LESS
    169 > GREATER
    170 = EQUAL
    171 . DOT
    172 % PERCENT
    173 ` BACKQUOTE
    174 { LBRACE
    175 } RBRACE
    176 @ AT
    177 @= ATEQUAL
    178 == EQEQUAL
    179 != NOTEQUAL
    180 <> NOTEQUAL
    181 <= LESSEQUAL
    182 >= GREATEREQUAL
    183 ~ TILDE
    184 ^ CIRCUMFLEX
    185 << LEFTSHIFT
    186 >> RIGHTSHIFT
    187 ** DOUBLESTAR
    188 += PLUSEQUAL
    189 -= MINEQUAL
    190 *= STAREQUAL
    191 /= SLASHEQUAL
    192 %= PERCENTEQUAL
    193 &= AMPEREQUAL
    194 |= VBAREQUAL
    195 ^= CIRCUMFLEXEQUAL
    196 <<= LEFTSHIFTEQUAL
    197 >>= RIGHTSHIFTEQUAL
    198 **= DOUBLESTAREQUAL
    199 // DOUBLESLASH
    200 //= DOUBLESLASHEQUAL
    201 -> RARROW
    202 """
    203 
    204 opmap = {}
    205 for line in opmap_raw.splitlines():
    206     if line:
    207         op, name = line.split()
    208         opmap[op] = getattr(token, name)
    209