Home | History | Annotate | Download | only in Lib
      1 """Parse a Python module and describe its classes and functions.
      2 
      3 Parse enough of a Python file to recognize imports and class and
      4 function definitions, and to find out the superclasses of a class.
      5 
      6 The interface consists of a single function:
      7     readmodule_ex(module, path=None)
      8 where module is the name of a Python module, and path is an optional
      9 list of directories where the module is to be searched.  If present,
     10 path is prepended to the system search path sys.path.  The return value
     11 is a dictionary.  The keys of the dictionary are the names of the
     12 classes and functions defined in the module (including classes that are
     13 defined via the from XXX import YYY construct).  The values are
     14 instances of classes Class and Function.  One special key/value pair is
     15 present for packages: the key '__path__' has a list as its value which
     16 contains the package search path.
     17 
     18 Classes and Functions have a common superclass: _Object.  Every instance
     19 has the following attributes:
     20     module  -- name of the module;
     21     name    -- name of the object;
     22     file    -- file in which the object is defined;
     23     lineno  -- line in the file where the object's definition starts;
     24     parent  -- parent of this object, if any;
     25     children -- nested objects contained in this object.
     26 The 'children' attribute is a dictionary mapping names to objects.
     27 
     28 Instances of Function describe functions with the attributes from _Object.
     29 
     30 Instances of Class describe classes with the attributes from _Object,
     31 plus the following:
     32     super   -- list of super classes (Class instances if possible);
     33     methods -- mapping of method names to beginning line numbers.
     34 If the name of a super class is not recognized, the corresponding
     35 entry in the list of super classes is not a class instance but a
     36 string giving the name of the super class.  Since import statements
     37 are recognized and imported modules are scanned as well, this
     38 shouldn't happen often.
     39 """
     40 
     41 import io
     42 import sys
     43 import importlib.util
     44 import tokenize
     45 from token import NAME, DEDENT, OP
     46 
# Public names exported by a star-import of this module.
__all__ = ["readmodule", "readmodule_ex", "Class", "Function"]

# Maps a full (dotted) module name to the tree dict that _readmodule built
# for it, so each module is scanned at most once per process.
_modules = {}  # Initialize cache of modules we've seen.
     50 
     51 
     52 class _Object:
     53     "Informaton about Python class or function."
     54     def __init__(self, module, name, file, lineno, parent):
     55         self.module = module
     56         self.name = name
     57         self.file = file
     58         self.lineno = lineno
     59         self.parent = parent
     60         self.children = {}
     61 
     62     def _addchild(self, name, obj):
     63         self.children[name] = obj
     64 
     65 
     66 class Function(_Object):
     67     "Information about a Python function, including methods."
     68     def __init__(self, module, name, file, lineno, parent=None):
     69         _Object.__init__(self, module, name, file, lineno, parent)
     70 
     71 
     72 class Class(_Object):
     73     "Information about a Python class."
     74     def __init__(self, module, name, super, file, lineno, parent=None):
     75         _Object.__init__(self, module, name, file, lineno, parent)
     76         self.super = [] if super is None else super
     77         self.methods = {}
     78 
     79     def _addmethod(self, name, lineno):
     80         self.methods[name] = lineno
     81 
     82 
     83 def _nest_function(ob, func_name, lineno):
     84     "Return a Function after nesting within ob."
     85     newfunc = Function(ob.module, func_name, ob.file, lineno, ob)
     86     ob._addchild(func_name, newfunc)
     87     if isinstance(ob, Class):
     88         ob._addmethod(func_name, lineno)
     89     return newfunc
     90 
     91 def _nest_class(ob, class_name, lineno, super=None):
     92     "Return a Class after nesting within ob."
     93     newclass = Class(ob.module, class_name, super, ob.file, lineno, ob)
     94     ob._addchild(class_name, newclass)
     95     return newclass
     96 
     97 def readmodule(module, path=None):
     98     """Return Class objects for the top-level classes in module.
     99 
    100     This is the original interface, before Functions were added.
    101     """
    102 
    103     res = {}
    104     for key, value in _readmodule(module, path or []).items():
    105         if isinstance(value, Class):
    106             res[key] = value
    107     return res
    108 
    109 def readmodule_ex(module, path=None):
    110     """Return a dictionary with all functions and classes in module.
    111 
    112     Search for module in PATH + sys.path.
    113     If possible, include imported superclasses.
    114     Do this by reading source, without importing (and executing) it.
    115     """
    116     return _readmodule(module, path or [])
    117 
    118 def _readmodule(module, path, inpackage=None):
    119     """Do the hard work for readmodule[_ex].
    120 
    121     If inpackage is given, it must be the dotted name of the package in
    122     which we are searching for a submodule, and then PATH must be the
    123     package search path; otherwise, we are searching for a top-level
    124     module, and path is combined with sys.path.
    125     """
    126     # Compute the full module name (prepending inpackage if set).
    127     if inpackage is not None:
    128         fullmodule = "%s.%s" % (inpackage, module)
    129     else:
    130         fullmodule = module
    131 
    132     # Check in the cache.
    133     if fullmodule in _modules:
    134         return _modules[fullmodule]
    135 
    136     # Initialize the dict for this module's contents.
    137     tree = {}
    138 
    139     # Check if it is a built-in module; we don't do much for these.
    140     if module in sys.builtin_module_names and inpackage is None:
    141         _modules[module] = tree
    142         return tree
    143 
    144     # Check for a dotted module name.
    145     i = module.rfind('.')
    146     if i >= 0:
    147         package = module[:i]
    148         submodule = module[i+1:]
    149         parent = _readmodule(package, path, inpackage)
    150         if inpackage is not None:
    151             package = "%s.%s" % (inpackage, package)
    152         if not '__path__' in parent:
    153             raise ImportError('No package named {}'.format(package))
    154         return _readmodule(submodule, parent['__path__'], package)
    155 
    156     # Search the path for the module.
    157     f = None
    158     if inpackage is not None:
    159         search_path = path
    160     else:
    161         search_path = path + sys.path
    162     spec = importlib.util._find_spec_from_path(fullmodule, search_path)
    163     _modules[fullmodule] = tree
    164     # Is module a package?
    165     if spec.submodule_search_locations is not None:
    166         tree['__path__'] = spec.submodule_search_locations
    167     try:
    168         source = spec.loader.get_source(fullmodule)
    169         if source is None:
    170             return tree
    171     except (AttributeError, ImportError):
    172         # If module is not Python source, we cannot do anything.
    173         return tree
    174 
    175     fname = spec.loader.get_filename(fullmodule)
    176     return _create_tree(fullmodule, path, fname, source, tree, inpackage)
    177 
    178 
    179 def _create_tree(fullmodule, path, fname, source, tree, inpackage):
    180     """Return the tree for a particular module.
    181 
    182     fullmodule (full module name), inpackage+module, becomes o.module.
    183     path is passed to recursive calls of _readmodule.
    184     fname becomes o.file.
    185     source is tokenized.  Imports cause recursive calls to _readmodule.
    186     tree is {} or {'__path__': <submodule search locations>}.
    187     inpackage, None or string, is passed to recursive calls of _readmodule.
    188 
    189     The effect of recursive calls is mutation of global _modules.
    190     """
    191     f = io.StringIO(source)
    192 
    193     stack = [] # Initialize stack of (class, indent) pairs.
    194 
    195     g = tokenize.generate_tokens(f.readline)
    196     try:
    197         for tokentype, token, start, _end, _line in g:
    198             if tokentype == DEDENT:
    199                 lineno, thisindent = start
    200                 # Close previous nested classes and defs.
    201                 while stack and stack[-1][1] >= thisindent:
    202                     del stack[-1]
    203             elif token == 'def':
    204                 lineno, thisindent = start
    205                 # Close previous nested classes and defs.
    206                 while stack and stack[-1][1] >= thisindent:
    207                     del stack[-1]
    208                 tokentype, func_name, start = next(g)[0:3]
    209                 if tokentype != NAME:
    210                     continue  # Skip def with syntax error.
    211                 cur_func = None
    212                 if stack:
    213                     cur_obj = stack[-1][0]
    214                     cur_func = _nest_function(cur_obj, func_name, lineno)
    215                 else:
    216                     # It is just a function.
    217                     cur_func = Function(fullmodule, func_name, fname, lineno)
    218                     tree[func_name] = cur_func
    219                 stack.append((cur_func, thisindent))
    220             elif token == 'class':
    221                 lineno, thisindent = start
    222                 # Close previous nested classes and defs.
    223                 while stack and stack[-1][1] >= thisindent:
    224                     del stack[-1]
    225                 tokentype, class_name, start = next(g)[0:3]
    226                 if tokentype != NAME:
    227                     continue # Skip class with syntax error.
    228                 # Parse what follows the class name.
    229                 tokentype, token, start = next(g)[0:3]
    230                 inherit = None
    231                 if token == '(':
    232                     names = [] # Initialize list of superclasses.
    233                     level = 1
    234                     super = [] # Tokens making up current superclass.
    235                     while True:
    236                         tokentype, token, start = next(g)[0:3]
    237                         if token in (')', ',') and level == 1:
    238                             n = "".join(super)
    239                             if n in tree:
    240                                 # We know this super class.
    241                                 n = tree[n]
    242                             else:
    243                                 c = n.split('.')
    244                                 if len(c) > 1:
    245                                     # Super class form is module.class:
    246                                     # look in module for class.
    247                                     m = c[-2]
    248                                     c = c[-1]
    249                                     if m in _modules:
    250                                         d = _modules[m]
    251                                         if c in d:
    252                                             n = d[c]
    253                             names.append(n)
    254                             super = []
    255                         if token == '(':
    256                             level += 1
    257                         elif token == ')':
    258                             level -= 1
    259                             if level == 0:
    260                                 break
    261                         elif token == ',' and level == 1:
    262                             pass
    263                         # Only use NAME and OP (== dot) tokens for type name.
    264                         elif tokentype in (NAME, OP) and level == 1:
    265                             super.append(token)
    266                         # Expressions in the base list are not supported.
    267                     inherit = names
    268                 if stack:
    269                     cur_obj = stack[-1][0]
    270                     cur_class = _nest_class(
    271                             cur_obj, class_name, lineno, inherit)
    272                 else:
    273                     cur_class = Class(fullmodule, class_name, inherit,
    274                                       fname, lineno)
    275                     tree[class_name] = cur_class
    276                 stack.append((cur_class, thisindent))
    277             elif token == 'import' and start[1] == 0:
    278                 modules = _getnamelist(g)
    279                 for mod, _mod2 in modules:
    280                     try:
    281                         # Recursively read the imported module.
    282                         if inpackage is None:
    283                             _readmodule(mod, path)
    284                         else:
    285                             try:
    286                                 _readmodule(mod, path, inpackage)
    287                             except ImportError:
    288                                 _readmodule(mod, [])
    289                     except:
    290                         # If we can't find or parse the imported module,
    291                         # too bad -- don't die here.
    292                         pass
    293             elif token == 'from' and start[1] == 0:
    294                 mod, token = _getname(g)
    295                 if not mod or token != "import":
    296                     continue
    297                 names = _getnamelist(g)
    298                 try:
    299                     # Recursively read the imported module.
    300                     d = _readmodule(mod, path, inpackage)
    301                 except:
    302                     # If we can't find or parse the imported module,
    303                     # too bad -- don't die here.
    304                     continue
    305                 # Add any classes that were defined in the imported module
    306                 # to our name space if they were mentioned in the list.
    307                 for n, n2 in names:
    308                     if n in d:
    309                         tree[n2 or n] = d[n]
    310                     elif n == '*':
    311                         # Don't add names that start with _.
    312                         for n in d:
    313                             if n[0] != '_':
    314                                 tree[n] = d[n]
    315     except StopIteration:
    316         pass
    317 
    318     f.close()
    319     return tree
    320 
    321 
    322 def _getnamelist(g):
    323     """Return list of (dotted-name, as-name or None) tuples for token source g.
    324 
    325     An as-name is the name that follows 'as' in an as clause.
    326     """
    327     names = []
    328     while True:
    329         name, token = _getname(g)
    330         if not name:
    331             break
    332         if token == 'as':
    333             name2, token = _getname(g)
    334         else:
    335             name2 = None
    336         names.append((name, name2))
    337         while token != "," and "\n" not in token:
    338             token = next(g)[1]
    339         if token != ",":
    340             break
    341     return names
    342 
    343 
    344 def _getname(g):
    345     "Return (dotted-name or None, next-token) tuple for token source g."
    346     parts = []
    347     tokentype, token = next(g)[0:2]
    348     if tokentype != NAME and token != '*':
    349         return (None, token)
    350     parts.append(token)
    351     while True:
    352         tokentype, token = next(g)[0:2]
    353         if token != '.':
    354             break
    355         tokentype, token = next(g)[0:2]
    356         if tokentype != NAME:
    357             break
    358         parts.append(token)
    359     return (".".join(parts), token)
    360 
    361 
    362 def _main():
    363     "Print module output (default this file) for quick visual check."
    364     import os
    365     try:
    366         mod = sys.argv[1]
    367     except:
    368         mod = __file__
    369     if os.path.exists(mod):
    370         path = [os.path.dirname(mod)]
    371         mod = os.path.basename(mod)
    372         if mod.lower().endswith(".py"):
    373             mod = mod[:-3]
    374     else:
    375         path = []
    376     tree = readmodule_ex(mod, path)
    377     lineno_key = lambda a: getattr(a, 'lineno', 0)
    378     objs = sorted(tree.values(), key=lineno_key, reverse=True)
    379     indent_level = 2
    380     while objs:
    381         obj = objs.pop()
    382         if isinstance(obj, list):
    383             # Value is a __path__ key.
    384             continue
    385         if not hasattr(obj, 'indent'):
    386             obj.indent = 0
    387 
    388         if isinstance(obj, _Object):
    389             new_objs = sorted(obj.children.values(),
    390                               key=lineno_key, reverse=True)
    391             for ob in new_objs:
    392                 ob.indent = obj.indent + indent_level
    393             objs.extend(new_objs)
    394         if isinstance(obj, Class):
    395             print("{}class {} {} {}"
    396                   .format(' ' * obj.indent, obj.name, obj.super, obj.lineno))
    397         elif isinstance(obj, Function):
    398             print("{}def {} {}".format(' ' * obj.indent, obj.name, obj.lineno))
    399 
# When executed as a script, print the quick visual check produced by _main().
if __name__ == "__main__":
    _main()
    402