# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.

"""

# Python imports
import pickle

# Local imports
from . import token, tokenize


class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses.  The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is an (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """
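
    # Illustrative sketch only (no real grammar produces exactly this): for a
    # single hypothetical rule 'pass_stmt: "pass"', with the rule given symbol
    # number 256 and the keyword "pass" given label number 1, the tables would
    # look roughly like:
    #
    #   symbol2number = {"pass_stmt": 256}
    #   number2symbol = {256: "pass_stmt"}
    #   labels        = [(0, "EMPTY"), (token.NAME, "pass")]
    #   keywords      = {"pass": 1}                 # keyword text -> label number
    #   states        = [[[(1, 1)], [(0, 1)]]]      # one DFA: arc on label 1, then final
    #   dfas          = {256: (states[0], {1: 1})}  # (DFA, first set)
    #   start         = 256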

    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        # Maps symbol names to label numbers; filled in by the pgen
        # table generator.
        self.symbol2label = {}
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file."""
        # A with-statement ensures the file is closed even if pickling fails.
        with open(filename, "wb") as f:
            pickle.dump(self.__dict__, f, 2)

    def load(self, filename):
        """Load the grammar tables from a pickle file."""
        with open(filename, "rb") as f:
            d = pickle.load(f)
        self.__dict__.update(d)
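
    # Typical round trip (an illustrative sketch; the file names here are
    # hypothetical, and generate_grammar() lives in the sibling pgen module):
    #
    #   from lib2to3.pgen2 import pgen              # path assumes the lib2to3 copy
    #   g = pgen.generate_grammar("Grammar.txt")    # build tables from a grammar file
    #   g.dump("Grammar.pickle")                    # cache the tables
    #   g2 = Grammar()
    #   g2.load("Grammar.pickle")                   # fast reload on later runs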

    def copy(self):
        """Copy the grammar."""
        new = self.__class__()
        # The top-level dicts are copied shallowly so that callers can add or
        # remove entries (e.g. keywords) without affecting the original.
        for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
                          "tokens", "symbol2label"):
            setattr(new, dict_attr, getattr(self, dict_attr).copy())
        new.labels = self.labels[:]
        new.states = self.states[:]
        new.start = self.start
        return new

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        print "s2n"
        pprint(self.symbol2number)
        print "n2s"
        pprint(self.number2symbol)
        print "states"
        pprint(self.states)
        print "dfas"
        pprint(self.dfas)
        print "labels"
        pprint(self.labels)
        print "start", self.start


# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

opmap = {}
for line in opmap_raw.splitlines():
    if line:
        op, name = line.split()
        opmap[op] = getattr(token, name)
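
# Illustrative sketch (not code taken from the parser itself): when a
# tokenizer reports an operator with the generic OP code, opmap recovers the
# specific token number from the operator text, e.g.
#
#   tok_type, tok_string = token.OP, "+="
#   if tok_type == token.OP:
#       tok_type = opmap[tok_string]    # now token.PLUSEQUAL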