Home | History | Annotate | Download | only in coverage
      1 """Code parsing for Coverage."""
      2 
      3 import opcode, re, sys, token, tokenize
      4 
      5 from coverage.backward import set, sorted, StringIO # pylint: disable=W0622
      6 from coverage.backward import open_source
      7 from coverage.bytecode import ByteCodes, CodeObjects
      8 from coverage.misc import nice_pair, expensive, join_regex
      9 from coverage.misc import CoverageException, NoSource, NotPython
     10 
     11 
     12 class CodeParser(object):
     13     """Parse code to find executable lines, excluded lines, etc."""
     14 
     15     def __init__(self, text=None, filename=None, exclude=None):
     16         """
     17         Source can be provided as `text`, the text itself, or `filename`, from
     18         which the text will be read.  Excluded lines are those that match
     19         `exclude`, a regex.
     20 
     21         """
     22         assert text or filename, "CodeParser needs either text or filename"
     23         self.filename = filename or "<code>"
     24         self.text = text
     25         if not self.text:
     26             try:
     27                 sourcef = open_source(self.filename)
     28                 try:
     29                     self.text = sourcef.read()
     30                 finally:
     31                     sourcef.close()
     32             except IOError:
     33                 _, err, _ = sys.exc_info()
     34                 raise NoSource(
     35                     "No source for code: %r: %s" % (self.filename, err)
     36                     )
     37 
     38         self.exclude = exclude
     39 
     40         self.show_tokens = False
     41 
     42         # The text lines of the parsed code.
     43         self.lines = self.text.split('\n')
     44 
     45         # The line numbers of excluded lines of code.
     46         self.excluded = set()
     47 
     48         # The line numbers of docstring lines.
     49         self.docstrings = set()
     50 
     51         # The line numbers of class definitions.
     52         self.classdefs = set()
     53 
     54         # A dict mapping line numbers to (lo,hi) for multi-line statements.
     55         self.multiline = {}
     56 
     57         # The line numbers that start statements.
     58         self.statement_starts = set()
     59 
     60         # Lazily-created ByteParser
     61         self._byte_parser = None
     62 
     63     def _get_byte_parser(self):
     64         """Create a ByteParser on demand."""
     65         if not self._byte_parser:
     66             self._byte_parser = \
     67                             ByteParser(text=self.text, filename=self.filename)
     68         return self._byte_parser
     69     byte_parser = property(_get_byte_parser)
     70 
     71     def lines_matching(self, *regexes):
     72         """Find the lines matching one of a list of regexes.
     73 
     74         Returns a set of line numbers, the lines that contain a match for one
     75         of the regexes in `regexes`.  The entire line needn't match, just a
     76         part of it.
     77 
     78         """
     79         regex_c = re.compile(join_regex(regexes))
     80         matches = set()
     81         for i, ltext in enumerate(self.lines):
     82             if regex_c.search(ltext):
     83                 matches.add(i+1)
     84         return matches
     85 
     86     def _raw_parse(self):
     87         """Parse the source to find the interesting facts about its lines.
     88 
     89         A handful of member fields are updated.
     90 
     91         """
     92         # Find lines which match an exclusion pattern.
     93         if self.exclude:
     94             self.excluded = self.lines_matching(self.exclude)
     95 
     96         # Tokenize, to find excluded suites, to find docstrings, and to find
     97         # multi-line statements.
     98         indent = 0
     99         exclude_indent = 0
    100         excluding = False
    101         prev_toktype = token.INDENT
    102         first_line = None
    103         empty = True
    104 
    105         tokgen = tokenize.generate_tokens(StringIO(self.text).readline)
    106         for toktype, ttext, (slineno, _), (elineno, _), ltext in tokgen:
    107             if self.show_tokens:                # pragma: no cover
    108                 print("%10s %5s %-20r %r" % (
    109                     tokenize.tok_name.get(toktype, toktype),
    110                     nice_pair((slineno, elineno)), ttext, ltext
    111                     ))
    112             if toktype == token.INDENT:
    113                 indent += 1
    114             elif toktype == token.DEDENT:
    115                 indent -= 1
    116             elif toktype == token.NAME and ttext == 'class':
    117                 # Class definitions look like branches in the byte code, so
    118                 # we need to exclude them.  The simplest way is to note the
    119                 # lines with the 'class' keyword.
    120                 self.classdefs.add(slineno)
    121             elif toktype == token.OP and ttext == ':':
    122                 if not excluding and elineno in self.excluded:
    123                     # Start excluding a suite.  We trigger off of the colon
    124                     # token so that the #pragma comment will be recognized on
    125                     # the same line as the colon.
    126                     exclude_indent = indent
    127                     excluding = True
    128             elif toktype == token.STRING and prev_toktype == token.INDENT:
    129                 # Strings that are first on an indented line are docstrings.
    130                 # (a trick from trace.py in the stdlib.) This works for
    131                 # 99.9999% of cases.  For the rest (!) see:
    132                 # http://stackoverflow.com/questions/1769332/x/1769794#1769794
    133                 for i in range(slineno, elineno+1):
    134                     self.docstrings.add(i)
    135             elif toktype == token.NEWLINE:
    136                 if first_line is not None and elineno != first_line:
    137                     # We're at the end of a line, and we've ended on a
    138                     # different line than the first line of the statement,
    139                     # so record a multi-line range.
    140                     rng = (first_line, elineno)
    141                     for l in range(first_line, elineno+1):
    142                         self.multiline[l] = rng
    143                 first_line = None
    144 
    145             if ttext.strip() and toktype != tokenize.COMMENT:
    146                 # A non-whitespace token.
    147                 empty = False
    148                 if first_line is None:
    149                     # The token is not whitespace, and is the first in a
    150                     # statement.
    151                     first_line = slineno
    152                     # Check whether to end an excluded suite.
    153                     if excluding and indent <= exclude_indent:
    154                         excluding = False
    155                     if excluding:
    156                         self.excluded.add(elineno)
    157 
    158             prev_toktype = toktype
    159 
    160         # Find the starts of the executable statements.
    161         if not empty:
    162             self.statement_starts.update(self.byte_parser._find_statements())
    163 
    164     def first_line(self, line):
    165         """Return the first line number of the statement including `line`."""
    166         rng = self.multiline.get(line)
    167         if rng:
    168             first_line = rng[0]
    169         else:
    170             first_line = line
    171         return first_line
    172 
    173     def first_lines(self, lines, ignore=None):
    174         """Map the line numbers in `lines` to the correct first line of the
    175         statement.
    176 
    177         Skip any line mentioned in `ignore`.
    178 
    179         Returns a sorted list of the first lines.
    180 
    181         """
    182         ignore = ignore or []
    183         lset = set()
    184         for l in lines:
    185             if l in ignore:
    186                 continue
    187             new_l = self.first_line(l)
    188             if new_l not in ignore:
    189                 lset.add(new_l)
    190         return sorted(lset)
    191 
    192     def parse_source(self):
    193         """Parse source text to find executable lines, excluded lines, etc.
    194 
    195         Return values are 1) a sorted list of executable line numbers, and
    196         2) a sorted list of excluded line numbers.
    197 
    198         Reported line numbers are normalized to the first line of multi-line
    199         statements.
    200 
    201         """
    202         self._raw_parse()
    203 
    204         excluded_lines = self.first_lines(self.excluded)
    205         ignore = excluded_lines + list(self.docstrings)
    206         lines = self.first_lines(self.statement_starts, ignore)
    207 
    208         return lines, excluded_lines
    209 
    210     def arcs(self):
    211         """Get information about the arcs available in the code.
    212 
    213         Returns a sorted list of line number pairs.  Line numbers have been
    214         normalized to the first line of multiline statements.
    215 
    216         """
    217         all_arcs = []
    218         for l1, l2 in self.byte_parser._all_arcs():
    219             fl1 = self.first_line(l1)
    220             fl2 = self.first_line(l2)
    221             if fl1 != fl2:
    222                 all_arcs.append((fl1, fl2))
    223         return sorted(all_arcs)
    224     arcs = expensive(arcs)
    225 
    226     def exit_counts(self):
    227         """Get a mapping from line numbers to count of exits from that line.
    228 
    229         Excluded lines are excluded.
    230 
    231         """
    232         excluded_lines = self.first_lines(self.excluded)
    233         exit_counts = {}
    234         for l1, l2 in self.arcs():
    235             if l1 < 0:
    236                 # Don't ever report -1 as a line number
    237                 continue
    238             if l1 in excluded_lines:
    239                 # Don't report excluded lines as line numbers.
    240                 continue
    241             if l2 in excluded_lines:
    242                 # Arcs to excluded lines shouldn't count.
    243                 continue
    244             if l1 not in exit_counts:
    245                 exit_counts[l1] = 0
    246             exit_counts[l1] += 1
    247 
    248         # Class definitions have one extra exit, so remove one for each:
    249         for l in self.classdefs:
    250             # Ensure key is there: classdefs can include excluded lines.
    251             if l in exit_counts:
    252                 exit_counts[l] -= 1
    253 
    254         return exit_counts
    255     exit_counts = expensive(exit_counts)
    256 
    257 
    258 ## Opcodes that guide the ByteParser.
    259 
    260 def _opcode(name):
    261     """Return the opcode by name from the opcode module."""
    262     return opcode.opmap[name]
    263 
    264 def _opcode_set(*names):
    265     """Return a set of opcodes by the names in `names`."""
    266     s = set()
    267     for name in names:
    268         try:
    269             s.add(_opcode(name))
    270         except KeyError:
    271             pass
    272     return s
    273 
    274 # Opcodes that leave the code object.
    275 OPS_CODE_END = _opcode_set('RETURN_VALUE')
    276 
    277 # Opcodes that unconditionally end the code chunk.
    278 OPS_CHUNK_END = _opcode_set(
    279     'JUMP_ABSOLUTE', 'JUMP_FORWARD', 'RETURN_VALUE', 'RAISE_VARARGS',
    280     'BREAK_LOOP', 'CONTINUE_LOOP',
    281     )
    282 
    283 # Opcodes that unconditionally begin a new code chunk.  By starting new chunks
    284 # with unconditional jump instructions, we neatly deal with jumps to jumps
    285 # properly.
    286 OPS_CHUNK_BEGIN = _opcode_set('JUMP_ABSOLUTE', 'JUMP_FORWARD')
    287 
    288 # Opcodes that push a block on the block stack.
    289 OPS_PUSH_BLOCK = _opcode_set(
    290     'SETUP_LOOP', 'SETUP_EXCEPT', 'SETUP_FINALLY', 'SETUP_WITH'
    291     )
    292 
    293 # Block types for exception handling.
    294 OPS_EXCEPT_BLOCKS = _opcode_set('SETUP_EXCEPT', 'SETUP_FINALLY')
    295 
    296 # Opcodes that pop a block from the block stack.
    297 OPS_POP_BLOCK = _opcode_set('POP_BLOCK')
    298 
    299 # Opcodes that have a jump destination, but aren't really a jump.
    300 OPS_NO_JUMP = OPS_PUSH_BLOCK
    301 
    302 # Individual opcodes we need below.
    303 OP_BREAK_LOOP = _opcode('BREAK_LOOP')
    304 OP_END_FINALLY = _opcode('END_FINALLY')
    305 OP_COMPARE_OP = _opcode('COMPARE_OP')
    306 COMPARE_EXCEPTION = 10  # just have to get this const from the code.
    307 OP_LOAD_CONST = _opcode('LOAD_CONST')
    308 OP_RETURN_VALUE = _opcode('RETURN_VALUE')
    309 
    310 
    311 class ByteParser(object):
    312     """Parse byte codes to understand the structure of code."""
    313 
    314     def __init__(self, code=None, text=None, filename=None):
    315         if code:
    316             self.code = code
    317             self.text = text
    318         else:
    319             if not text:
    320                 assert filename, "If no code or text, need a filename"
    321                 sourcef = open_source(filename)
    322                 try:
    323                     text = sourcef.read()
    324                 finally:
    325                     sourcef.close()
    326             self.text = text
    327 
    328             try:
    329                 # Python 2.3 and 2.4 don't like partial last lines, so be sure
    330                 # the text ends nicely for them.
    331                 self.code = compile(text + '\n', filename, "exec")
    332             except SyntaxError:
    333                 _, synerr, _ = sys.exc_info()
    334                 raise NotPython(
    335                     "Couldn't parse '%s' as Python source: '%s' at line %d" %
    336                         (filename, synerr.msg, synerr.lineno)
    337                     )
    338 
    339         # Alternative Python implementations don't always provide all the
    340         # attributes on code objects that we need to do the analysis.
    341         for attr in ['co_lnotab', 'co_firstlineno', 'co_consts', 'co_code']:
    342             if not hasattr(self.code, attr):
    343                 raise CoverageException(
    344                     "This implementation of Python doesn't support code "
    345                     "analysis.\n"
    346                     "Run coverage.py under CPython for this command."
    347                     )
    348 
    349     def child_parsers(self):
    350         """Iterate over all the code objects nested within this one.
    351 
    352         The iteration includes `self` as its first value.
    353 
    354         """
    355         children = CodeObjects(self.code)
    356         return [ByteParser(code=c, text=self.text) for c in children]
    357 
    358     # Getting numbers from the lnotab value changed in Py3.0.
    359     if sys.version_info >= (3, 0):
    360         def _lnotab_increments(self, lnotab):
    361             """Return a list of ints from the lnotab bytes in 3.x"""
    362             return list(lnotab)
    363     else:
    364         def _lnotab_increments(self, lnotab):
    365             """Return a list of ints from the lnotab string in 2.x"""
    366             return [ord(c) for c in lnotab]
    367 
    368     def _bytes_lines(self):
    369         """Map byte offsets to line numbers in `code`.
    370 
    371         Uses co_lnotab described in Python/compile.c to map byte offsets to
    372         line numbers.  Returns a list: [(b0, l0), (b1, l1), ...]
    373 
    374         """
    375         # Adapted from dis.py in the standard library.
    376         byte_increments = self._lnotab_increments(self.code.co_lnotab[0::2])
    377         line_increments = self._lnotab_increments(self.code.co_lnotab[1::2])
    378 
    379         bytes_lines = []
    380         last_line_num = None
    381         line_num = self.code.co_firstlineno
    382         byte_num = 0
    383         for byte_incr, line_incr in zip(byte_increments, line_increments):
    384             if byte_incr:
    385                 if line_num != last_line_num:
    386                     bytes_lines.append((byte_num, line_num))
    387                     last_line_num = line_num
    388                 byte_num += byte_incr
    389             line_num += line_incr
    390         if line_num != last_line_num:
    391             bytes_lines.append((byte_num, line_num))
    392         return bytes_lines
    393 
    394     def _find_statements(self):
    395         """Find the statements in `self.code`.
    396 
    397         Return a set of line numbers that start statements.  Recurses into all
    398         code objects reachable from `self.code`.
    399 
    400         """
    401         stmts = set()
    402         for bp in self.child_parsers():
    403             # Get all of the lineno information from this code.
    404             for _, l in bp._bytes_lines():
    405                 stmts.add(l)
    406         return stmts
    407 
    408     def _split_into_chunks(self):
    409         """Split the code object into a list of `Chunk` objects.
    410 
    411         Each chunk is only entered at its first instruction, though there can
    412         be many exits from a chunk.
    413 
    414         Returns a list of `Chunk` objects.
    415 
    416         """
    417 
    418         # The list of chunks so far, and the one we're working on.
    419         chunks = []
    420         chunk = None
    421         bytes_lines_map = dict(self._bytes_lines())
    422 
    423         # The block stack: loops and try blocks get pushed here for the
    424         # implicit jumps that can occur.
    425         # Each entry is a tuple: (block type, destination)
    426         block_stack = []
    427 
    428         # Some op codes are followed by branches that should be ignored.  This
    429         # is a count of how many ignores are left.
    430         ignore_branch = 0
    431 
    432         # We have to handle the last two bytecodes specially.
    433         ult = penult = None
    434 
    435         for bc in ByteCodes(self.code.co_code):
    436             # Maybe have to start a new chunk
    437             if bc.offset in bytes_lines_map:
    438                 # Start a new chunk for each source line number.
    439                 if chunk:
    440                     chunk.exits.add(bc.offset)
    441                 chunk = Chunk(bc.offset, bytes_lines_map[bc.offset])
    442                 chunks.append(chunk)
    443             elif bc.op in OPS_CHUNK_BEGIN:
    444                 # Jumps deserve their own unnumbered chunk.  This fixes
    445                 # problems with jumps to jumps getting confused.
    446                 if chunk:
    447                     chunk.exits.add(bc.offset)
    448                 chunk = Chunk(bc.offset)
    449                 chunks.append(chunk)
    450 
    451             if not chunk:
    452                 chunk = Chunk(bc.offset)
    453                 chunks.append(chunk)
    454 
    455             # Look at the opcode
    456             if bc.jump_to >= 0 and bc.op not in OPS_NO_JUMP:
    457                 if ignore_branch:
    458                     # Someone earlier wanted us to ignore this branch.
    459                     ignore_branch -= 1
    460                 else:
    461                     # The opcode has a jump, it's an exit for this chunk.
    462                     chunk.exits.add(bc.jump_to)
    463 
    464             if bc.op in OPS_CODE_END:
    465                 # The opcode can exit the code object.
    466                 chunk.exits.add(-self.code.co_firstlineno)
    467             if bc.op in OPS_PUSH_BLOCK:
    468                 # The opcode adds a block to the block_stack.
    469                 block_stack.append((bc.op, bc.jump_to))
    470             if bc.op in OPS_POP_BLOCK:
    471                 # The opcode pops a block from the block stack.
    472                 block_stack.pop()
    473             if bc.op in OPS_CHUNK_END:
    474                 # This opcode forces the end of the chunk.
    475                 if bc.op == OP_BREAK_LOOP:
    476                     # A break is implicit: jump where the top of the
    477                     # block_stack points.
    478                     chunk.exits.add(block_stack[-1][1])
    479                 chunk = None
    480             if bc.op == OP_END_FINALLY:
    481                 if block_stack:
    482                     # A break that goes through a finally will jump to whatever
    483                     # block is on top of the stack.
    484                     chunk.exits.add(block_stack[-1][1])
    485                 # For the finally clause we need to find the closest exception
    486                 # block, and use its jump target as an exit.
    487                 for iblock in range(len(block_stack)-1, -1, -1):
    488                     if block_stack[iblock][0] in OPS_EXCEPT_BLOCKS:
    489                         chunk.exits.add(block_stack[iblock][1])
    490                         break
    491             if bc.op == OP_COMPARE_OP and bc.arg == COMPARE_EXCEPTION:
    492                 # This is an except clause.  We want to overlook the next
    493                 # branch, so that except's don't count as branches.
    494                 ignore_branch += 1
    495 
    496             penult = ult
    497             ult = bc
    498 
    499         if chunks:
    500             # The last two bytecodes could be a dummy "return None" that
    501             # shouldn't be counted as real code. Every Python code object seems
    502             # to end with a return, and a "return None" is inserted if there
    503             # isn't an explicit return in the source.
    504             if ult and penult:
    505                 if penult.op == OP_LOAD_CONST and ult.op == OP_RETURN_VALUE:
    506                     if self.code.co_consts[penult.arg] is None:
    507                         # This is "return None", but is it dummy?  A real line
    508                         # would be a last chunk all by itself.
    509                         if chunks[-1].byte != penult.offset:
    510                             ex = -self.code.co_firstlineno
    511                             # Split the last chunk
    512                             last_chunk = chunks[-1]
    513                             last_chunk.exits.remove(ex)
    514                             last_chunk.exits.add(penult.offset)
    515                             chunk = Chunk(penult.offset)
    516                             chunk.exits.add(ex)
    517                             chunks.append(chunk)
    518 
    519             # Give all the chunks a length.
    520             chunks[-1].length = bc.next_offset - chunks[-1].byte
    521             for i in range(len(chunks)-1):
    522                 chunks[i].length = chunks[i+1].byte - chunks[i].byte
    523 
    524         return chunks
    525 
    526     def _arcs(self):
    527         """Find the executable arcs in the code.
    528 
    529         Returns a set of pairs, (from,to).  From and to are integer line
    530         numbers.  If from is < 0, then the arc is an entrance into the code
    531         object.  If to is < 0, the arc is an exit from the code object.
    532 
    533         """
    534         chunks = self._split_into_chunks()
    535 
    536         # A map from byte offsets to chunks jumped into.
    537         byte_chunks = dict([(c.byte, c) for c in chunks])
    538 
    539         # Build a map from byte offsets to actual lines reached.
    540         byte_lines = {}
    541         bytes_to_add = set([c.byte for c in chunks])
    542 
    543         while bytes_to_add:
    544             byte_to_add = bytes_to_add.pop()
    545             if byte_to_add in byte_lines or byte_to_add < 0:
    546                 continue
    547 
    548             # Which lines does this chunk lead to?
    549             bytes_considered = set()
    550             bytes_to_consider = [byte_to_add]
    551             lines = set()
    552 
    553             while bytes_to_consider:
    554                 byte = bytes_to_consider.pop()
    555                 bytes_considered.add(byte)
    556 
    557                 # Find chunk for byte
    558                 try:
    559                     ch = byte_chunks[byte]
    560                 except KeyError:
    561                     for ch in chunks:
    562                         if ch.byte <= byte < ch.byte+ch.length:
    563                             break
    564                     else:
    565                         # No chunk for this byte!
    566                         raise Exception("Couldn't find chunk @ %d" % byte)
    567                     byte_chunks[byte] = ch
    568 
    569                 if ch.line:
    570                     lines.add(ch.line)
    571                 else:
    572                     for ex in ch.exits:
    573                         if ex < 0:
    574                             lines.add(ex)
    575                         elif ex not in bytes_considered:
    576                             bytes_to_consider.append(ex)
    577 
    578                 bytes_to_add.update(ch.exits)
    579 
    580             byte_lines[byte_to_add] = lines
    581 
    582         # Figure out for each chunk where the exits go.
    583         arcs = set()
    584         for chunk in chunks:
    585             if chunk.line:
    586                 for ex in chunk.exits:
    587                     if ex < 0:
    588                         exit_lines = [ex]
    589                     else:
    590                         exit_lines = byte_lines[ex]
    591                     for exit_line in exit_lines:
    592                         if chunk.line != exit_line:
    593                             arcs.add((chunk.line, exit_line))
    594         for line in byte_lines[0]:
    595             arcs.add((-1, line))
    596 
    597         return arcs
    598 
    599     def _all_chunks(self):
    600         """Returns a list of `Chunk` objects for this code and its children.
    601 
    602         See `_split_into_chunks` for details.
    603 
    604         """
    605         chunks = []
    606         for bp in self.child_parsers():
    607             chunks.extend(bp._split_into_chunks())
    608 
    609         return chunks
    610 
    611     def _all_arcs(self):
    612         """Get the set of all arcs in this code object and its children.
    613 
    614         See `_arcs` for details.
    615 
    616         """
    617         arcs = set()
    618         for bp in self.child_parsers():
    619             arcs.update(bp._arcs())
    620 
    621         return arcs
    622 
    623 
    624 class Chunk(object):
    625     """A sequence of bytecodes with a single entrance.
    626 
    627     To analyze byte code, we have to divide it into chunks, sequences of byte
    628     codes such that each basic block has only one entrance, the first
    629     instruction in the block.
    630 
    631     This is almost the CS concept of `basic block`_, except that we're willing
    632     to have many exits from a chunk, and "basic block" is a more cumbersome
    633     term.
    634 
    635     .. _basic block: http://en.wikipedia.org/wiki/Basic_block
    636 
    637     An exit < 0 means the chunk can leave the code (return).  The exit is
    638     the negative of the starting line number of the code block.
    639 
    640     """
    641     def __init__(self, byte, line=0):
    642         self.byte = byte
    643         self.line = line
    644         self.length = 0
    645         self.exits = set()
    646 
    647     def __repr__(self):
    648         return "<%d+%d @%d %r>" % (
    649             self.byte, self.length, self.line, list(self.exits)
    650             )
    651