Home | History | Annotate | Download | only in libscanbuild
      1 # -*- coding: utf-8 -*-
      2 #                     The LLVM Compiler Infrastructure
      3 #
      4 # This file is distributed under the University of Illinois Open Source
      5 # License. See LICENSE.TXT for details.
      6 """ This module is responsible for capturing the compiler invocations of any
      7 build process. The result of that is a compilation database.
      8 
      9 This implementation is using the LD_PRELOAD or DYLD_INSERT_LIBRARIES
     10 mechanisms provided by the dynamic linker. The related library is implemented
     11 in C language and can be found under 'libear' directory.
     12 
     13 The 'libear' library is capturing all child process creation and logging the
     14 relevant information about it into separate files in a specified directory.
     15 The parameter of this process is the output directory name, where the report
     16 files shall be placed. This parameter is passed as an environment variable.
     17 
     18 The module also implements compiler wrappers to intercept the compiler calls.
     19 
     20 The module implements the build command execution and the post-processing of
     21 the output files, which are condensed into a compilation database. """
     22 
     23 import sys
     24 import os
     25 import os.path
     26 import re
     27 import itertools
     28 import json
     29 import glob
     30 import argparse
     31 import logging
     32 import subprocess
     33 from libear import build_libear, TemporaryDirectory
     34 from libscanbuild import command_entry_point
     35 from libscanbuild import duplicate_check, tempdir, initialize_logging
     36 from libscanbuild.compilation import split_command
     37 from libscanbuild.shell import encode, decode
     38 
     39 __all__ = ['capture', 'intercept_build_main', 'intercept_build_wrapper']
     40 
     41 GS = chr(0x1d)
     42 RS = chr(0x1e)
     43 US = chr(0x1f)
     44 
     45 COMPILER_WRAPPER_CC = 'intercept-cc'
     46 COMPILER_WRAPPER_CXX = 'intercept-c++'
     47 
     48 
     49 @command_entry_point
     50 def intercept_build_main(bin_dir):
     51     """ Entry point for 'intercept-build' command. """
     52 
     53     parser = create_parser()
     54     args = parser.parse_args()
     55 
     56     initialize_logging(args.verbose)
     57     logging.debug('Parsed arguments: %s', args)
     58 
     59     if not args.build:
     60         parser.print_help()
     61         return 0
     62 
     63     return capture(args, bin_dir)
     64 
     65 
     66 def capture(args, bin_dir):
     67     """ The entry point of build command interception. """
     68 
     69     def post_processing(commands):
     70         """ To make a compilation database, it needs to filter out commands
     71         which are not compiler calls. Needs to find the source file name
     72         from the arguments. And do shell escaping on the command.
     73 
     74         To support incremental builds, it is desired to read elements from
     75         an existing compilation database from a previous run. These elements
     76         shall be merged with the new elements. """
     77 
     78         # create entries from the current run
     79         current = itertools.chain.from_iterable(
     80             # creates a sequence of entry generators from an exec,
     81             format_entry(command) for command in commands)
     82         # read entries from previous run
     83         if 'append' in args and args.append and os.path.isfile(args.cdb):
     84             with open(args.cdb) as handle:
     85                 previous = iter(json.load(handle))
     86         else:
     87             previous = iter([])
     88         # filter out duplicate entries from both
     89         duplicate = duplicate_check(entry_hash)
     90         return (entry
     91                 for entry in itertools.chain(previous, current)
     92                 if os.path.exists(entry['file']) and not duplicate(entry))
     93 
     94     with TemporaryDirectory(prefix='intercept-', dir=tempdir()) as tmp_dir:
     95         # run the build command
     96         environment = setup_environment(args, tmp_dir, bin_dir)
     97         logging.debug('run build in environment: %s', environment)
     98         exit_code = subprocess.call(args.build, env=environment)
     99         logging.info('build finished with exit code: %d', exit_code)
    100         # read the intercepted exec calls
    101         exec_traces = itertools.chain.from_iterable(
    102             parse_exec_trace(os.path.join(tmp_dir, filename))
    103             for filename in sorted(glob.iglob(os.path.join(tmp_dir, '*.cmd'))))
    104         # do post processing only if that was requested
    105         if 'raw_entries' not in args or not args.raw_entries:
    106             entries = post_processing(exec_traces)
    107         else:
    108             entries = exec_traces
    109         # dump the compilation database
    110         with open(args.cdb, 'w+') as handle:
    111             json.dump(list(entries), handle, sort_keys=True, indent=4)
    112         return exit_code
    113 
    114 
    115 def setup_environment(args, destination, bin_dir):
    116     """ Sets up the environment for the build command.
    117 
    118     It sets the required environment variables and execute the given command.
    119     The exec calls will be logged by the 'libear' preloaded library or by the
    120     'wrapper' programs. """
    121 
    122     c_compiler = args.cc if 'cc' in args else 'cc'
    123     cxx_compiler = args.cxx if 'cxx' in args else 'c++'
    124 
    125     libear_path = None if args.override_compiler or is_preload_disabled(
    126         sys.platform) else build_libear(c_compiler, destination)
    127 
    128     environment = dict(os.environ)
    129     environment.update({'INTERCEPT_BUILD_TARGET_DIR': destination})
    130 
    131     if not libear_path:
    132         logging.debug('intercept gonna use compiler wrappers')
    133         environment.update({
    134             'CC': os.path.join(bin_dir, COMPILER_WRAPPER_CC),
    135             'CXX': os.path.join(bin_dir, COMPILER_WRAPPER_CXX),
    136             'INTERCEPT_BUILD_CC': c_compiler,
    137             'INTERCEPT_BUILD_CXX': cxx_compiler,
    138             'INTERCEPT_BUILD_VERBOSE': 'DEBUG' if args.verbose > 2 else 'INFO'
    139         })
    140     elif sys.platform == 'darwin':
    141         logging.debug('intercept gonna preload libear on OSX')
    142         environment.update({
    143             'DYLD_INSERT_LIBRARIES': libear_path,
    144             'DYLD_FORCE_FLAT_NAMESPACE': '1'
    145         })
    146     else:
    147         logging.debug('intercept gonna preload libear on UNIX')
    148         environment.update({'LD_PRELOAD': libear_path})
    149 
    150     return environment
    151 
    152 
    153 def intercept_build_wrapper(cplusplus):
    154     """ Entry point for `intercept-cc` and `intercept-c++` compiler wrappers.
    155 
    156     It does generate execution report into target directory. And execute
    157     the wrapped compilation with the real compiler. The parameters for
    158     report and execution are from environment variables.
    159 
    160     Those parameters which for 'libear' library can't have meaningful
    161     values are faked. """
    162 
    163     # initialize wrapper logging
    164     logging.basicConfig(format='intercept: %(levelname)s: %(message)s',
    165                         level=os.getenv('INTERCEPT_BUILD_VERBOSE', 'INFO'))
    166     # write report
    167     try:
    168         target_dir = os.getenv('INTERCEPT_BUILD_TARGET_DIR')
    169         if not target_dir:
    170             raise UserWarning('exec report target directory not found')
    171         pid = str(os.getpid())
    172         target_file = os.path.join(target_dir, pid + '.cmd')
    173         logging.debug('writing exec report to: %s', target_file)
    174         with open(target_file, 'ab') as handler:
    175             working_dir = os.getcwd()
    176             command = US.join(sys.argv) + US
    177             content = RS.join([pid, pid, 'wrapper', working_dir, command]) + GS
    178             handler.write(content.encode('utf-8'))
    179     except IOError:
    180         logging.exception('writing exec report failed')
    181     except UserWarning as warning:
    182         logging.warning(warning)
    183     # execute with real compiler
    184     compiler = os.getenv('INTERCEPT_BUILD_CXX', 'c++') if cplusplus \
    185         else os.getenv('INTERCEPT_BUILD_CC', 'cc')
    186     compilation = [compiler] + sys.argv[1:]
    187     logging.debug('execute compiler: %s', compilation)
    188     return subprocess.call(compilation)
    189 
    190 
    191 def parse_exec_trace(filename):
    192     """ Parse the file generated by the 'libear' preloaded library.
    193 
    194     Given filename points to a file which contains the basic report
    195     generated by the interception library or wrapper command. A single
    196     report file _might_ contain multiple process creation info. """
    197 
    198     logging.debug('parse exec trace file: %s', filename)
    199     with open(filename, 'r') as handler:
    200         content = handler.read()
    201         for group in filter(bool, content.split(GS)):
    202             records = group.split(RS)
    203             yield {
    204                 'pid': records[0],
    205                 'ppid': records[1],
    206                 'function': records[2],
    207                 'directory': records[3],
    208                 'command': records[4].split(US)[:-1]
    209             }
    210 
    211 
    212 def format_entry(exec_trace):
    213     """ Generate the desired fields for compilation database entries. """
    214 
    215     def abspath(cwd, name):
    216         """ Create normalized absolute path from input filename. """
    217         fullname = name if os.path.isabs(name) else os.path.join(cwd, name)
    218         return os.path.normpath(fullname)
    219 
    220     logging.debug('format this command: %s', exec_trace['command'])
    221     compilation = split_command(exec_trace['command'])
    222     if compilation:
    223         for source in compilation.files:
    224             compiler = 'c++' if compilation.compiler == 'c++' else 'cc'
    225             command = [compiler, '-c'] + compilation.flags + [source]
    226             logging.debug('formated as: %s', command)
    227             yield {
    228                 'directory': exec_trace['directory'],
    229                 'command': encode(command),
    230                 'file': abspath(exec_trace['directory'], source)
    231             }
    232 
    233 
    234 def is_preload_disabled(platform):
    235     """ Library-based interposition will fail silently if SIP is enabled,
    236     so this should be detected. You can detect whether SIP is enabled on
    237     Darwin by checking whether (1) there is a binary called 'csrutil' in
    238     the path and, if so, (2) whether the output of executing 'csrutil status'
    239     contains 'System Integrity Protection status: enabled'.
    240 
    241     Same problem on linux when SELinux is enabled. The status query program
    242     'sestatus' and the output when it's enabled 'SELinux status: enabled'. """
    243 
    244     if platform == 'darwin':
    245         pattern = re.compile(r'System Integrity Protection status:\s+enabled')
    246         command = ['csrutil', 'status']
    247     elif platform in {'linux', 'linux2'}:
    248         pattern = re.compile(r'SELinux status:\s+enabled')
    249         command = ['sestatus']
    250     else:
    251         return False
    252 
    253     try:
    254         lines = subprocess.check_output(command).decode('utf-8')
    255         return any((pattern.match(line) for line in lines.splitlines()))
    256     except:
    257         return False
    258 
    259 
    260 def entry_hash(entry):
    261     """ Implement unique hash method for compilation database entries. """
    262 
    263     # For faster lookup in set filename is reverted
    264     filename = entry['file'][::-1]
    265     # For faster lookup in set directory is reverted
    266     directory = entry['directory'][::-1]
    267     # On OS X the 'cc' and 'c++' compilers are wrappers for
    268     # 'clang' therefore both call would be logged. To avoid
    269     # this the hash does not contain the first word of the
    270     # command.
    271     command = ' '.join(decode(entry['command'])[1:])
    272 
    273     return '<>'.join([filename, directory, command])
    274 
    275 
    276 def create_parser():
    277     """ Command line argument parser factory method. """
    278 
    279     parser = argparse.ArgumentParser(
    280         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    281 
    282     parser.add_argument(
    283         '--verbose', '-v',
    284         action='count',
    285         default=0,
    286         help="""Enable verbose output from '%(prog)s'. A second and third
    287                 flag increases verbosity.""")
    288     parser.add_argument(
    289         '--cdb',
    290         metavar='<file>',
    291         default="compile_commands.json",
    292         help="""The JSON compilation database.""")
    293     group = parser.add_mutually_exclusive_group()
    294     group.add_argument(
    295         '--append',
    296         action='store_true',
    297         help="""Append new entries to existing compilation database.""")
    298     group.add_argument(
    299         '--disable-filter', '-n',
    300         dest='raw_entries',
    301         action='store_true',
    302         help="""Intercepted child process creation calls (exec calls) are all
    303                 logged to the output. The output is not a compilation database.
    304                 This flag is for debug purposes.""")
    305 
    306     advanced = parser.add_argument_group('advanced options')
    307     advanced.add_argument(
    308         '--override-compiler',
    309         action='store_true',
    310         help="""Always resort to the compiler wrapper even when better
    311                 intercept methods are available.""")
    312     advanced.add_argument(
    313         '--use-cc',
    314         metavar='<path>',
    315         dest='cc',
    316         default='cc',
    317         help="""When '%(prog)s' analyzes a project by interposing a compiler
    318                 wrapper, which executes a real compiler for compilation and
    319                 do other tasks (record the compiler invocation). Because of
    320                 this interposing, '%(prog)s' does not know what compiler your
    321                 project normally uses. Instead, it simply overrides the CC
    322                 environment variable, and guesses your default compiler.
    323 
    324                 If you need '%(prog)s' to use a specific compiler for
    325                 *compilation* then you can use this option to specify a path
    326                 to that compiler.""")
    327     advanced.add_argument(
    328         '--use-c++',
    329         metavar='<path>',
    330         dest='cxx',
    331         default='c++',
    332         help="""This is the same as "--use-cc" but for C++ code.""")
    333 
    334     parser.add_argument(
    335         dest='build',
    336         nargs=argparse.REMAINDER,
    337         help="""Command to run.""")
    338 
    339     return parser
    340