Home | History | Annotate | Download | only in pgen2
      1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
      2 # Licensed to PSF under a Contributor Agreement.
      3 
      4 """Convert graminit.[ch] spit out by pgen to Python code.
      5 
      6 Pgen is the Python parser generator.  It is useful to quickly create a
      7 parser from a grammar file in Python's grammar notation.  But I don't
      8 want my parsers to be written in C (yet), so I'm translating the
      9 parsing tables to Python data structures and writing a Python parse
     10 engine.
     11 
     12 Note that the token numbers are constants determined by the standard
     13 Python tokenizer.  The standard token module defines these numbers and
     14 their names (the names are not used much).  The token numbers are
     15 hardcoded into the Python tokenizer and into pgen.  A Python
     16 implementation of the Python tokenizer is also available, in the
     17 standard tokenize module.
     18 
     19 On the other hand, symbol numbers (representing the grammar's
     20 non-terminals) are assigned by pgen based on the actual grammar
     21 input.
     22 
     23 Note: this module is pretty much obsolete; the pgen module generates
     24 equivalent grammar tables directly from the Grammar.txt input file
     25 without having to invoke the Python pgen C program.
     26 
     27 """
     28 
     29 # Python imports
     30 import re
     31 
     32 # Local imports
     33 from pgen2 import grammar, token
     34 
     35 
     36 class Converter(grammar.Grammar):
     37     """Grammar subclass that reads classic pgen output files.
     38 
     39     The run() method reads the tables as produced by the pgen parser
     40     generator, typically contained in two C files, graminit.h and
     41     graminit.c.  The other methods are for internal use only.
     42 
     43     See the base class for more documentation.
     44 
     45     """
     46 
     47     def run(self, graminit_h, graminit_c):
     48         """Load the grammar tables from the text files written by pgen."""
     49         self.parse_graminit_h(graminit_h)
     50         self.parse_graminit_c(graminit_c)
     51         self.finish_off()
     52 
     53     def parse_graminit_h(self, filename):
     54         """Parse the .h file written by pgen.  (Internal)
     55 
     56         This file is a sequence of #define statements defining the
     57         nonterminals of the grammar as numbers.  We build two tables
     58         mapping the numbers to names and back.
     59 
     60         """
     61         try:
     62             f = open(filename)
     63         except IOError, err:
     64             print "Can't open %s: %s" % (filename, err)
     65             return False
     66         self.symbol2number = {}
     67         self.number2symbol = {}
     68         lineno = 0
     69         for line in f:
     70             lineno += 1
     71             mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
     72             if not mo and line.strip():
     73                 print "%s(%s): can't parse %s" % (filename, lineno,
     74                                                   line.strip())
     75             else:
     76                 symbol, number = mo.groups()
     77                 number = int(number)
     78                 assert symbol not in self.symbol2number
     79                 assert number not in self.number2symbol
     80                 self.symbol2number[symbol] = number
     81                 self.number2symbol[number] = symbol
     82         return True
     83 
     84     def parse_graminit_c(self, filename):
     85         """Parse the .c file written by pgen.  (Internal)
     86 
     87         The file looks as follows.  The first two lines are always this:
     88 
     89         #include "pgenheaders.h"
     90         #include "grammar.h"
     91 
     92         After that come four blocks:
     93 
     94         1) one or more state definitions
     95         2) a table defining dfas
     96         3) a table defining labels
     97         4) a struct defining the grammar
     98 
     99         A state definition has the following form:
    100         - one or more arc arrays, each of the form:
    101           static arc arcs_<n>_<m>[<k>] = {
    102                   {<i>, <j>},
    103                   ...
    104           };
    105         - followed by a state array, of the form:
    106           static state states_<s>[<t>] = {
    107                   {<k>, arcs_<n>_<m>},
    108                   ...
    109           };
    110 
    111         """
    112         try:
    113             f = open(filename)
    114         except IOError, err:
    115             print "Can't open %s: %s" % (filename, err)
    116             return False
    117         # The code below essentially uses f's iterator-ness!
    118         lineno = 0
    119 
    120         # Expect the two #include lines
    121         lineno, line = lineno+1, f.next()
    122         assert line == '#include "pgenheaders.h"\n', (lineno, line)
    123         lineno, line = lineno+1, f.next()
    124         assert line == '#include "grammar.h"\n', (lineno, line)
    125 
    126         # Parse the state definitions
    127         lineno, line = lineno+1, f.next()
    128         allarcs = {}
    129         states = []
    130         while line.startswith("static arc "):
    131             while line.startswith("static arc "):
    132                 mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$",
    133                               line)
    134                 assert mo, (lineno, line)
    135                 n, m, k = map(int, mo.groups())
    136                 arcs = []
    137                 for _ in range(k):
    138                     lineno, line = lineno+1, f.next()
    139                     mo = re.match(r"\s+{(\d+), (\d+)},$", line)
    140                     assert mo, (lineno, line)
    141                     i, j = map(int, mo.groups())
    142                     arcs.append((i, j))
    143                 lineno, line = lineno+1, f.next()
    144                 assert line == "};\n", (lineno, line)
    145                 allarcs[(n, m)] = arcs
    146                 lineno, line = lineno+1, f.next()
    147             mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
    148             assert mo, (lineno, line)
    149             s, t = map(int, mo.groups())
    150             assert s == len(states), (lineno, line)
    151             state = []
    152             for _ in range(t):
    153                 lineno, line = lineno+1, f.next()
    154                 mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
    155                 assert mo, (lineno, line)
    156                 k, n, m = map(int, mo.groups())
    157                 arcs = allarcs[n, m]
    158                 assert k == len(arcs), (lineno, line)
    159                 state.append(arcs)
    160             states.append(state)
    161             lineno, line = lineno+1, f.next()
    162             assert line == "};\n", (lineno, line)
    163             lineno, line = lineno+1, f.next()
    164         self.states = states
    165 
    166         # Parse the dfas
    167         dfas = {}
    168         mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
    169         assert mo, (lineno, line)
    170         ndfas = int(mo.group(1))
    171         for i in range(ndfas):
    172             lineno, line = lineno+1, f.next()
    173             mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
    174                           line)
    175             assert mo, (lineno, line)
    176             symbol = mo.group(2)
    177             number, x, y, z = map(int, mo.group(1, 3, 4, 5))
    178             assert self.symbol2number[symbol] == number, (lineno, line)
    179             assert self.number2symbol[number] == symbol, (lineno, line)
    180             assert x == 0, (lineno, line)
    181             state = states[z]
    182             assert y == len(state), (lineno, line)
    183             lineno, line = lineno+1, f.next()
    184             mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
    185             assert mo, (lineno, line)
    186             first = {}
    187             rawbitset = eval(mo.group(1))
    188             for i, c in enumerate(rawbitset):
    189                 byte = ord(c)
    190                 for j in range(8):
    191                     if byte & (1<<j):
    192                         first[i*8 + j] = 1
    193             dfas[number] = (state, first)
    194         lineno, line = lineno+1, f.next()
    195         assert line == "};\n", (lineno, line)
    196         self.dfas = dfas
    197 
    198         # Parse the labels
    199         labels = []
    200         lineno, line = lineno+1, f.next()
    201         mo = re.match(r"static label labels\[(\d+)\] = {$", line)
    202         assert mo, (lineno, line)
    203         nlabels = int(mo.group(1))
    204         for i in range(nlabels):
    205             lineno, line = lineno+1, f.next()
    206             mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
    207             assert mo, (lineno, line)
    208             x, y = mo.groups()
    209             x = int(x)
    210             if y == "0":
    211                 y = None
    212             else:
    213                 y = eval(y)
    214             labels.append((x, y))
    215         lineno, line = lineno+1, f.next()
    216         assert line == "};\n", (lineno, line)
    217         self.labels = labels
    218 
    219         # Parse the grammar struct
    220         lineno, line = lineno+1, f.next()
    221         assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
    222         lineno, line = lineno+1, f.next()
    223         mo = re.match(r"\s+(\d+),$", line)
    224         assert mo, (lineno, line)
    225         ndfas = int(mo.group(1))
    226         assert ndfas == len(self.dfas)
    227         lineno, line = lineno+1, f.next()
    228         assert line == "\tdfas,\n", (lineno, line)
    229         lineno, line = lineno+1, f.next()
    230         mo = re.match(r"\s+{(\d+), labels},$", line)
    231         assert mo, (lineno, line)
    232         nlabels = int(mo.group(1))
    233         assert nlabels == len(self.labels), (lineno, line)
    234         lineno, line = lineno+1, f.next()
    235         mo = re.match(r"\s+(\d+)$", line)
    236         assert mo, (lineno, line)
    237         start = int(mo.group(1))
    238         assert start in self.number2symbol, (lineno, line)
    239         self.start = start
    240         lineno, line = lineno+1, f.next()
    241         assert line == "};\n", (lineno, line)
    242         try:
    243             lineno, line = lineno+1, f.next()
    244         except StopIteration:
    245             pass
    246         else:
    247             assert 0, (lineno, line)
    248 
    249     def finish_off(self):
    250         """Create additional useful structures.  (Internal)."""
    251         self.keywords = {} # map from keyword strings to arc labels
    252         self.tokens = {}   # map from numeric token values to arc labels
    253         for ilabel, (type, value) in enumerate(self.labels):
    254             if type == token.NAME and value is not None:
    255                 self.keywords[value] = ilabel
    256             elif value is None:
    257                 self.tokens[type] = ilabel
    258