#!/usr/bin/env python
# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Script to transform and merge sancov files into human-readable json format.

The script supports three actions:
all: Writes a json file with all instrumented lines of all executables.
merge: Merges sancov files with coverage output into an existing json file.
split: Splits a json file into separate files per covered source file.

The json data is structured as follows:
{
  "version": 1,
  "tests": ["executable1", "executable2", ...],
  "files": {
    "file1": [[<instr line 1>, <bit_mask>], [<instr line 2>, <bit_mask>], ...],
    "file2": [...],
    ...
  }
}

The executables are sorted and determine the test bit mask. An executable's
bit is 1 << index, e.g. executable1 = 1 and executable3 = 4. Hence, a line
covered by executable1 and executable3 will have bit_mask == 5 == 0b101. The
number of tests is restricted to 52 in version 1 to allow JavaScript JSON
parsing of the bitsets encoded as numbers. The JS maximum safe integer is
2^53 - 1.

The line-number/bit_mask pairs are sorted by line number and don't contain
duplicates.

Split json data preserves the same format, but each output json file contains
only one source file.

The sancov tool is expected to be in the llvm compiler-rt third-party
directory. It's not checked out by default and must be added as a custom deps
entry:
'v8/third_party/llvm/projects/compiler-rt':
    'https://chromium.googlesource.com/external/llvm.org/compiler-rt.git'
"""

import argparse
import json
import logging
import os
import re
import subprocess
import sys

from multiprocessing import Pool, cpu_count


logging.basicConfig(level=logging.INFO)

# Files to exclude from coverage. Dropping their data early speeds things up.
# The contained cc files are already excluded from instrumentation, but inlined
# data is referenced through v8's object files.
EXCLUSIONS = [
  'buildtools',
  'src/third_party',
  'third_party',
  'test',
  'testing',
]

# Executables found in the build output for which no coverage is generated.
# Exclude them from the coverage data file.
EXE_BLACKLIST = [
  'generate-bytecode-expectations',
  'hello-world',
  'mksnapshot',
  'parser-shell',
  'process',
  'shell',
]

# V8 checkout directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))))

# Executable location. TODO(machenbach): Only release is supported for now.
BUILD_DIR = os.path.join(BASE_DIR, 'out', 'Release')

# Path prefix added by the llvm symbolizer including trailing slash.
OUTPUT_PATH_PREFIX = os.path.join(BUILD_DIR, '..', '..', '')

# The sancov tool location.
SANCOV_TOOL = os.path.join(
    BASE_DIR, 'third_party', 'llvm', 'projects', 'compiler-rt',
    'lib', 'sanitizer_common', 'scripts', 'sancov.py')

# Simple script to sanitize the PCs from objdump.
SANITIZE_PCS = os.path.join(BASE_DIR, 'tools', 'sanitizers', 'sanitize_pcs.py')

# The llvm symbolizer location.
SYMBOLIZER = os.path.join(
    BASE_DIR, 'third_party', 'llvm-build', 'Release+Asserts', 'bin',
    'llvm-symbolizer')

# Number of cpus.
CPUS = cpu_count()

# Regexp to find sancov files as output by sancov_merger.py. Also grabs the
# executable name in group 1.
SANCOV_FILE_RE = re.compile(r'^(.*)\.result\.sancov$')
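
# For illustration (hypothetical file name): 'd8.result.sancov' matches, and
# group 1 yields the executable name:
#
#   SANCOV_FILE_RE.match('d8.result.sancov').group(1)  # == 'd8'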


def executables():
  """Iterates over executable files in the build directory."""
  for f in os.listdir(BUILD_DIR):
    file_path = os.path.join(BUILD_DIR, f)
    if (os.path.isfile(file_path) and
        os.access(file_path, os.X_OK) and
        f not in EXE_BLACKLIST):
      yield file_path


def process_symbolizer_output(output):
  """Post-processes llvm symbolizer output.

  Excludes files outside the v8 checkout and files matching the exclusion
  list above from further processing. Drops the character index in each line.

  Returns: A mapping of file names to lists of line numbers. The file names
           are relative to the v8 base directory. The lists of line numbers
           don't contain duplicates and are sorted.
  """
  # Drop path prefix when iterating lines. The path is redundant and takes
  # too much space. Drop files outside that path, e.g. generated files in
  # the build dir and absolute paths to c++ library headers.
  def iter_lines():
    for line in output.strip().splitlines():
      if line.startswith(OUTPUT_PATH_PREFIX):
        yield line[len(OUTPUT_PATH_PREFIX):]

  # Map file names to sets of instrumented line numbers.
  file_map = {}
  for line in iter_lines():
    # Drop the character number; we only care about line numbers. Each line
    # has the form: <file name>:<line number>:<character number>.
    file_name, number, _ = line.split(':')
    file_map.setdefault(file_name, set()).add(int(number))

  # Remove exclusion patterns from file map. It's cheaper to do it after the
  # mapping, as there are few excluded files and we don't want to do this
  # check for numerous lines in ordinary files.
  def keep(file_name):
    for e in EXCLUSIONS:
      if file_name.startswith(e):
        return False
    return True

  # Return in serializable form and filter.
  return {k: sorted(file_map[k]) for k in file_map if keep(k)}
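

# A minimal sketch (not called by the tool) of the transformation above,
# using hypothetical symbolizer output. Lines outside the checkout are
# dropped, the path prefix and character index are stripped, and line
# numbers are deduplicated and sorted:
def _example_process_symbolizer_output():
  output = '\n'.join([
      OUTPUT_PATH_PREFIX + 'src/foo.cc:10:5',   # hypothetical file
      OUTPUT_PATH_PREFIX + 'src/foo.cc:10:12',  # duplicate of line 10
      OUTPUT_PATH_PREFIX + 'src/foo.cc:7:1',
      '/usr/include/foo/bar.h:3:1',             # outside checkout, dropped
  ])
  assert process_symbolizer_output(output) == {'src/foo.cc': [7, 10]}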


def get_instrumented_lines(executable):
  """Returns the instrumented lines of an executable.

  Called through a multiprocessing pool.

  Returns: Post-processed llvm output as returned by process_symbolizer_output.
  """
  # The first two pipes are from llvm's tool sancov.py with 0x added to the hex
  # numbers. The results are piped into the llvm symbolizer, which outputs for
  # each PC: <file name with abs path>:<line number>:<character number>.
  # We don't call the sancov tool itself, for speed.
  process = subprocess.Popen(
      'objdump -d %s | '
      'grep \'^\s\+[0-9a-f]\+:.*\scall\(q\|\)\s\+[0-9a-f]\+ '
      '<__sanitizer_cov\(_with_check\|\)\(@plt\|\)>\' | '
      'grep \'^\s\+[0-9a-f]\+\' -o | '
      '%s | '
      '%s --obj %s -functions=none' %
          (executable, SANITIZE_PCS, SYMBOLIZER, executable),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return process_symbolizer_output(output)


def merge_instrumented_line_results(exe_list, results):
  """Merges multiprocessing results for all instrumented lines.

  Args:
    exe_list: List of all executable names with absolute paths.
    results: List of results as returned by get_instrumented_lines.

  Returns: Dict to be used as json data as specified at the top of this file.
           The dictionary contains all instrumented lines of all files
           referenced by all executables.
  """
  def merge_files(x, y):
    for file_name, lines in y.iteritems():
      x.setdefault(file_name, set()).update(lines)
    return x
  result = reduce(merge_files, results, {})

  # Return data as file->lines mapping. The lines are saved as lists
  # with (line number, test bits (as int)). The test bits are initialized with
  # 0, meaning instrumented, but no coverage.
  # The order of the test bits is given with key 'tests'. For now, these are
  # the executable names. We use a _list_ with two items instead of a tuple to
  # ease merging by allowing mutation of the second item.
  return {
    'version': 1,
    'tests': sorted(map(os.path.basename, exe_list)),
    'files': {f: map(lambda l: [l, 0], sorted(result[f])) for f in result},
  }
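

# Shape of the result above, sketched with hypothetical inputs (not called by
# the tool). Two executables contribute instrumented lines; all bit masks
# start at 0, meaning instrumented but not yet covered:
def _example_merge_instrumented_line_results():
  exe_list = ['/abs/path/exe_b', '/abs/path/exe_a']  # hypothetical paths
  results = [{'src/foo.cc': [7, 10]}, {'src/foo.cc': [7], 'src/bar.cc': [3]}]
  assert merge_instrumented_line_results(exe_list, results) == {
      'version': 1,
      'tests': ['exe_a', 'exe_b'],
      'files': {'src/foo.cc': [[7, 0], [10, 0]], 'src/bar.cc': [[3, 0]]},
  }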


def write_instrumented(options):
  """Implements the 'all' action of this tool."""
  exe_list = list(executables())
  logging.info('Reading instrumented lines from %d executables.',
               len(exe_list))
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_instrumented_lines, exe_list)
  finally:
    pool.close()

  # Merge multiprocessing results and prepare output data.
  data = merge_instrumented_line_results(exe_list, results)

  logging.info('Read data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write json output.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)


def get_covered_lines(args):
  """Returns the covered lines of an executable.

  Called through a multiprocessing pool. The args are expected to unpack to:
    cov_dir: Folder with sancov files merged by sancov_merger.py.
    executable: The executable that was called to produce the given coverage
                data.
    sancov_file: The merged sancov file with coverage data.

  Returns: A tuple of post-processed llvm output as returned by
           process_symbolizer_output and the executable name.
  """
  cov_dir, executable, sancov_file = args

  # Let the sancov tool print the covered PCs and pipe them through the llvm
  # symbolizer.
  process = subprocess.Popen(
      '%s print %s 2> /dev/null | '
      '%s --obj %s -functions=none' %
          (SANCOV_TOOL,
           os.path.join(cov_dir, sancov_file),
           SYMBOLIZER,
           os.path.join(BUILD_DIR, executable)),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return process_symbolizer_output(output), executable


def merge_covered_line_results(data, results):
  """Merges multiprocessing results for covered lines.

  The data is mutated; the results are merged into it in place.

  Args:
    data: Existing coverage data from a json file containing all instrumented
          lines.
    results: List of results as returned by get_covered_lines.
  """

  # List of executables and mapping to the test bit mask. The number of
  # tests is restricted to 52, to allow JavaScript JSON parsing of
  # the bitsets encoded as numbers. The JS maximum safe integer is 2^53 - 1.
  exe_list = data['tests']
  assert len(exe_list) <= 52, 'Max 52 different tests are supported.'
  test_bit_masks = {exe: 1 << i for i, exe in enumerate(exe_list)}

  def merge_lines(old_lines, new_lines, mask):
    """Merges the coverage data of a list of lines.

    Args:
      old_lines: Lines as list of pairs with line number and test bit mask.
                 The new lines will be merged into the list in place.
      new_lines: List of new (covered) lines (sorted).
      mask: The bit to be set for covered lines. The bit index is the test
            index of the executable that covered the line.
    """
    i = 0
    # Iterate over old and new lines; both are sorted.
    for l in new_lines:
      while old_lines[i][0] < l:
        # Forward instrumented lines not present in this coverage data.
        i += 1
        # TODO: Add more context to the assert message.
        assert i < len(old_lines), 'Covered line %d not in input file.' % l
      assert old_lines[i][0] == l, 'Covered line %d not in input file.' % l

      # Add coverage information to the line.
      old_lines[i][1] |= mask

  def merge_files(data, result):
    """Merges result into data.

    The data is mutated in place.

    Args:
      data: Merged coverage data from the previous reduce step.
      result: New result to be merged in. The type is as returned by
              get_covered_lines.
    """
    file_map, executable = result
    files = data['files']
    for file_name, lines in file_map.iteritems():
      merge_lines(files[file_name], lines, test_bit_masks[executable])
    return data

  reduce(merge_files, results, data)
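

# A minimal sketch (not called by the tool) of the in-place merge above,
# with hypothetical data. exe_b is the second sorted test, so it sets the
# bit 1 << 1:
def _example_merge_covered_line_results():
  data = {
      'version': 1,
      'tests': ['exe_a', 'exe_b'],
      'files': {'src/foo.cc': [[7, 0], [10, 0]]},
  }
  # exe_b covered line 10 of src/foo.cc.
  merge_covered_line_results(data, [({'src/foo.cc': [10]}, 'exe_b')])
  assert data['files'] == {'src/foo.cc': [[7, 0], [10, 2]]}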


def merge(options):
  """Implements the 'merge' action of this tool."""

  # Check if folder with coverage output exists.
  assert (os.path.exists(options.coverage_dir) and
          os.path.isdir(options.coverage_dir))

  # Inputs for multiprocessing. List of tuples of:
  # Coverage dir, executable name, sancov file name.
  inputs = []
  for f in os.listdir(options.coverage_dir):
    match = SANCOV_FILE_RE.match(f)
    if match:
      inputs.append((options.coverage_dir, match.group(1), f))

  logging.info('Merging %d sancov files into %s',
               len(inputs), options.json_input)

  # Post-process covered lines in parallel.
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_covered_lines, inputs)
  finally:
    pool.close()

  # Load existing json data file for merging the results.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  # Merge multiprocessing results. Mutates data.
  merge_covered_line_results(data, results)

  logging.info('Merged data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write merged results to file.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)


def split(options):
  """Implements the 'split' action of this tool."""
  # Load existing json data file for splitting.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  logging.info('Splitting off %d coverage files from %s',
               len(data['files']), options.json_input)

  for file_name, coverage in data['files'].iteritems():
    # Preserve relative directories that are part of the file name.
    file_path = os.path.join(options.output_dir, file_name + '.json')
    try:
      os.makedirs(os.path.dirname(file_path))
    except OSError:
      # Ignore existing directories.
      pass

    with open(file_path, 'w') as f:
      # Flat-copy the old dict.
      new_data = dict(data)

      # Update current file.
      new_data['files'] = {file_name: coverage}

      # Write json data.
      json.dump(new_data, f, sort_keys=True)


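# Sketch of the per-file splitting above (hypothetical data, not called by
# the tool): each output json keeps 'version' and 'tests', but 'files' is
# replaced with a single entry. Note that dict(data) is a flat copy, so the
# 'tests' list is shared with the original:
def _example_split_output():
  data = {'version': 1, 'tests': ['exe_a'],
          'files': {'src/foo.cc': [[7, 1]], 'src/bar.cc': [[3, 0]]}}
  new_data = dict(data)  # flat copy
  new_data['files'] = {'src/foo.cc': data['files']['src/foo.cc']}
  assert new_data['files'] == {'src/foo.cc': [[7, 1]]}
  assert new_data['tests'] is data['tests']

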
def main(args=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--coverage-dir',
                      help='Path to the sancov output files.')
  parser.add_argument('--json-input',
                      help='Path to an existing json file with coverage data.')
  parser.add_argument('--json-output',
                      help='Path to a file to write json output to.')
  parser.add_argument('--output-dir',
                      help='Directory to write the split output files to.')
  parser.add_argument('action', choices=['all', 'merge', 'split'],
                      help='Action to perform.')

  options = parser.parse_args(args)
  if options.action.lower() == 'all':
    if not options.json_output:
      print '--json-output is required'
      return 1
    write_instrumented(options)
  elif options.action.lower() == 'merge':
    if not options.coverage_dir:
      print '--coverage-dir is required'
      return 1
    if not options.json_input:
      print '--json-input is required'
      return 1
    if not options.json_output:
      print '--json-output is required'
      return 1
    merge(options)
  elif options.action.lower() == 'split':
    if not options.json_input:
      print '--json-input is required'
      return 1
    if not options.output_dir:
      print '--output-dir is required'
      return 1
    split(options)
  return 0
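
# Typical invocations of this script (hypothetical paths):
#
#   <this script> all --json-output data.json
#   <this script> merge --coverage-dir coverage --json-input data.json \
#       --json-output merged.json
#   <this script> split --json-input merged.json --output-dir split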


if __name__ == '__main__':
  sys.exit(main())