skia/tools/parse_llvm_coverage.py

#!/usr/bin/env python
# Copyright (c) 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.


"""Parse an LLVM coverage report to generate useable results."""


import argparse
import json
import os
import re
import subprocess
import sys


def _fix_filename(filename):
  """Return a filename which we can use to identify the file.

  The file paths printed by llvm-cov take the form:

      /path/to/repo/out/dir/../../src/filename.cpp

  And then they're truncated to 22 characters with leading ellipses:

      ...../../src/filename.cpp

  This makes it really tough to determine whether the file actually belongs in
  the Skia repo.  This function strips out the leading junk so that, if the file
  exists in the repo, the returned string matches the end of some relative path
  in the repo. This doesn't guarantee correctness, but it's about as close as
  we can get.
  """
  return filename.split('..')[-1].lstrip('./')


def _file_in_repo(filename, all_files):
  """Return the name of the checked-in file matching the given filename.

  The filename is first normalized with _fix_filename, then compared against
  every entry of all_files by suffix.

  Args:
    filename: file path as printed in the coverage report.
    all_files: iterable of candidate file paths (strings).

  Returns:
    The single entry of all_files that ends with the normalized filename, or
    None if there is no match or more than one match. In the ambiguous case a
    warning listing the candidates is written to stderr.
  """
  suffix = _fix_filename(filename)
  matched = [f for f in all_files if f.endswith(suffix)]
  if len(matched) == 1:
    return matched[0]
  if matched:
    # Write to the stream directly instead of using the Python-2-only
    # "print >> stream" statement, so this works on Python 2 and Python 3.
    sys.stderr.write('WARNING: multiple matches for %s; skipping:\n\t%s\n'
                     % (suffix, '\n\t'.join(matched)))
  return None


def _get_per_file_per_line_coverage(report):
  """Return a dict whose keys are file names and values are coverage data.

  Values are lists which take the form (lineno, coverage, code).
  """
  all_files = []
  for root, dirs, files in os.walk(os.getcwd()):
    if 'third_party/externals' in root:
      continue
    files = [f for f in files if not (f[0] == '.' or f.endswith('.pyc'))]
    dirs[:] = [d for d in dirs if not d[0] == '.']
    for name in files:
      all_files.append(os.path.join(root[(len(os.getcwd()) + 1):], name))
  all_files.sort()

  lines = report.splitlines()
  current_file = None
  file_lines = []
  files = {}
  not_checked_in = '%' # Use this as the file name for not-checked-in files.
  for line in lines:
    m = re.match('([a-zA-Z0-9\./_-]+):', line)
    if m:
      if current_file and current_file != not_checked_in:
        files[current_file] = file_lines
      match_filename = _file_in_repo(m.groups()[0], all_files)
      current_file = match_filename or not_checked_in
      file_lines = []
    else:
      if current_file != not_checked_in:
        skip = re.match('^\s{2}-+$|^\s{2}\|.+$', line)
        if line and not skip:
          cov, linenum, code = line.split('|', 2)
          cov = cov.strip()
          if cov:
            cov = int(cov)
          else:
            cov = None # We don't care about coverage for this line.
          linenum = int(linenum.strip())
          assert linenum == len(file_lines) + 1
          file_lines.append((linenum, cov, code.decode('utf-8', 'replace')))
  return files


def _testname(filename):
  """Transform the file name into an ingestible test name."""
  return re.sub(r'[^a-zA-Z0-9]', '_', filename)


def _nanobench_json(results, properties, key):
  """Return the results in JSON format like that produced by nanobench."""
  rv = {}
  # Copy over the properties first, then set the 'key' and 'results' keys,
  # in order to avoid bad formatting in case the user passes in a properties
  # dict containing those keys.
  rv.update(properties)
  rv['key'] = key
  rv['results'] = {
    _testname(f): {
      'coverage': {
        'percent': percent,
        'lines_not_covered': not_covered_lines,
        'options': {
          'fullname': f,
          'dir': os.path.dirname(f),
          'source_type': 'coverage',
        },
      },
    } for percent, not_covered_lines, f in results
  }
  return rv


def _parse_key_value(kv_list):
  """Return a dict whose key/value pairs are derived from the given list.

  For example:

      ['k1', 'v1', 'k2', 'v2']
  becomes:

      {'k1': 'v1',
       'k2': 'v2'}
  """
  if len(kv_list) % 2 != 0:
    raise Exception('Invalid key/value pairs: %s' % kv_list)

  rv = {}
  for i in xrange(len(kv_list) / 2):
    rv[kv_list[i*2]] = kv_list[i*2+1]
  return rv


def _get_per_file_summaries(line_by_line):
  """Summarize the full line-by-line coverage report by file."""
  per_file = []
  for filepath, lines in line_by_line.iteritems():
    total_lines = 0
    covered_lines = 0
    for _, cov, _ in lines:
      if cov is not None:
        total_lines += 1
        if cov > 0:
          covered_lines += 1
    if total_lines > 0:
      per_file.append((float(covered_lines)/float(total_lines)*100.0,
                       total_lines - covered_lines,
                       filepath))
  return per_file


def main():
  """Read a coverage report and write the requested JSON outputs.

  Flags:
    --report: (required) path of the coverage report to read.
    --linebyline: if given, path where the per-file, per-line coverage data
        is written as JSON.
    --nanobench: if given, path where the per-file summaries are written as
        nanobench-style JSON. Requires both --key and --properties.
    --key, --properties: flat lists of alternating keys and values.

  Raises:
    Exception: if --nanobench is given without both --key and --properties.
  """
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument('--report', required=True,
                          help='input file; an llvm coverage report.')
  arg_parser.add_argument('--nanobench',
                          help='output file for nanobench data.')
  # --key and --properties take the same shape of argument; only the help
  # text differs.
  for flag, description in (
      ('--key', 'key/value pairs identifying this bot.'),
      ('--properties',
       'key/value pairs representing properties of this build.')):
    arg_parser.add_argument(flag, metavar='key_or_value', nargs='+',
                            help=description)
  arg_parser.add_argument('--linebyline',
                          help='output file for line-by-line JSON data.')
  options = arg_parser.parse_args()

  if options.nanobench:
    if not options.key or not options.properties:
      raise Exception('--key and --properties are required with --nanobench')

  with open(options.report) as report_file:
    report_text = report_file.read()

  coverage = _get_per_file_per_line_coverage(report_text)

  if options.linebyline:
    with open(options.linebyline, 'w') as out:
      json.dump(coverage, out)

  if not options.nanobench:
    return

  # Build the complete nanobench structure before opening the output file.
  bot_key = _parse_key_value(options.key)
  build_properties = _parse_key_value(options.properties)
  summaries = _get_per_file_summaries(coverage)
  nanobench_data = _nanobench_json(summaries, build_properties, bot_key)
  with open(options.nanobench, 'w') as out:
    json.dump(nanobench_data, out)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
  main()