Home | History | Annotate | Download | only in scripts
      1 #!/usr/bin/env python
      2 #===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
      3 #
      4 #                     The LLVM Compiler Infrastructure
      5 #
      6 # This file is distributed under the University of Illinois Open Source
      7 # License. See LICENSE.TXT for details.
      8 #
      9 #===------------------------------------------------------------------------===#
     10 import argparse
     11 import bisect
     12 import getopt
     13 import os
     14 import re
     15 import subprocess
     16 import sys
     17 
# Cache of ChainSymbolizer objects, keyed by binary name.
symbolizers = {}
# When true, the symbolizer command lines and their inputs are printed.
DEBUG = False
# Set by -d/--demangle: ask the external tools to demangle function names.
demangle = False
# Set by -c: prefix prepended to the 'addr2line' tool name.
binutils_prefix = None
# Set by -s: path prepended to binary names by sysroot_path_filter().
sysroot_path = None
# Callable applied to binary names before symbolization, or None.
binary_name_filter = None
# Positional command-line patterns that fix_filename() cuts from file names.
fix_filename_patterns = None
# Stream the log is read from.
logfile = sys.stdin
# When False, symbolization raises instead of falling back to addr2line/atos.
allow_system_symbolizer = True
     27 
     28 # FIXME: merge the code that calls fix_filename().
     29 def fix_filename(file_name):
     30   if fix_filename_patterns:
     31     for path_to_cut in fix_filename_patterns:
     32       file_name = re.sub('.*' + path_to_cut, '', file_name)
     33   file_name = re.sub('.*asan_[a-z_]*.cc:[0-9]*', '_asan_rtl_', file_name)
     34   file_name = re.sub('.*crtstuff.c:0', '???:0', file_name)
     35   return file_name
     36 
     37 def sysroot_path_filter(binary_name):
     38   return sysroot_path + binary_name
     39 
     40 def guess_arch(addr):
     41   # Guess which arch we're running. 10 = len('0x') + 8 hex digits.
     42   if len(addr) > 10:
     43     return 'x86_64'
     44   else:
     45     return 'i386'
     46 
     47 class Symbolizer(object):
     48   def __init__(self):
     49     pass
     50 
     51   def symbolize(self, addr, binary, offset):
     52     """Symbolize the given address (pair of binary and offset).
     53 
     54     Overriden in subclasses.
     55     Args:
     56         addr: virtual address of an instruction.
     57         binary: path to executable/shared object containing this instruction.
     58         offset: instruction offset in the @binary.
     59     Returns:
     60         list of strings (one string for each inlined frame) describing
     61         the code locations for this instruction (that is, function name, file
     62         name, line and column numbers).
     63     """
     64     return None
     65 
     66 
     67 class LLVMSymbolizer(Symbolizer):
     68   def __init__(self, symbolizer_path, default_arch, system, dsym_hints=[]):
     69     super(LLVMSymbolizer, self).__init__()
     70     self.symbolizer_path = symbolizer_path
     71     self.default_arch = default_arch
     72     self.system = system
     73     self.dsym_hints = dsym_hints
     74     self.pipe = self.open_llvm_symbolizer()
     75 
     76   def open_llvm_symbolizer(self):
     77     cmd = [self.symbolizer_path,
     78            '--use-symbol-table=true',
     79            '--demangle=%s' % demangle,
     80            '--functions=linkage',
     81            '--inlining=true',
     82            '--default-arch=%s' % self.default_arch]
     83     if self.system == 'Darwin':
     84       for hint in self.dsym_hints:
     85         cmd.append('--dsym-hint=%s' % hint)
     86     if DEBUG:
     87       print ' '.join(cmd)
     88     try:
     89       result = subprocess.Popen(cmd, stdin=subprocess.PIPE,
     90                                 stdout=subprocess.PIPE)
     91     except OSError:
     92       result = None
     93     return result
     94 
     95   def symbolize(self, addr, binary, offset):
     96     """Overrides Symbolizer.symbolize."""
     97     if not self.pipe:
     98       return None
     99     result = []
    100     try:
    101       symbolizer_input = '"%s" %s' % (binary, offset)
    102       if DEBUG:
    103         print symbolizer_input
    104       print >> self.pipe.stdin, symbolizer_input
    105       while True:
    106         function_name = self.pipe.stdout.readline().rstrip()
    107         if not function_name:
    108           break
    109         file_name = self.pipe.stdout.readline().rstrip()
    110         file_name = fix_filename(file_name)
    111         if (not function_name.startswith('??') or
    112             not file_name.startswith('??')):
    113           # Append only non-trivial frames.
    114           result.append('%s in %s %s' % (addr, function_name,
    115                                          file_name))
    116     except Exception:
    117       result = []
    118     if not result:
    119       result = None
    120     return result
    121 
    122 
    123 def LLVMSymbolizerFactory(system, default_arch, dsym_hints=[]):
    124   symbolizer_path = os.getenv('LLVM_SYMBOLIZER_PATH')
    125   if not symbolizer_path:
    126     symbolizer_path = os.getenv('ASAN_SYMBOLIZER_PATH')
    127     if not symbolizer_path:
    128       # Assume llvm-symbolizer is in PATH.
    129       symbolizer_path = 'llvm-symbolizer'
    130   return LLVMSymbolizer(symbolizer_path, default_arch, system, dsym_hints)
    131 
    132 
    133 class Addr2LineSymbolizer(Symbolizer):
    134   def __init__(self, binary):
    135     super(Addr2LineSymbolizer, self).__init__()
    136     self.binary = binary
    137     self.pipe = self.open_addr2line()
    138     self.output_terminator = -1
    139 
    140   def open_addr2line(self):
    141     addr2line_tool = 'addr2line'
    142     if binutils_prefix:
    143       addr2line_tool = binutils_prefix + addr2line_tool
    144     cmd = [addr2line_tool, '-fi']
    145     if demangle:
    146       cmd += ['--demangle']
    147     cmd += ['-e', self.binary]
    148     if DEBUG:
    149       print ' '.join(cmd)
    150     return subprocess.Popen(cmd,
    151                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    152 
    153   def symbolize(self, addr, binary, offset):
    154     """Overrides Symbolizer.symbolize."""
    155     if self.binary != binary:
    156       return None
    157     lines = []
    158     try:
    159       print >> self.pipe.stdin, offset
    160       print >> self.pipe.stdin, self.output_terminator
    161       is_first_frame = True
    162       while True:
    163         function_name = self.pipe.stdout.readline().rstrip()
    164         file_name = self.pipe.stdout.readline().rstrip()
    165         if is_first_frame:
    166           is_first_frame = False
    167         elif function_name in ['', '??']:
    168           assert file_name == function_name
    169           break
    170         lines.append((function_name, file_name));
    171     except Exception:
    172       lines.append(('??', '??:0'))
    173     return ['%s in %s %s' % (addr, function, fix_filename(file)) for (function, file) in lines]
    174 
    175 class UnbufferedLineConverter(object):
    176   """
    177   Wrap a child process that responds to each line of input with one line of
    178   output.  Uses pty to trick the child into providing unbuffered output.
    179   """
    180   def __init__(self, args, close_stderr=False):
    181     # Local imports so that the script can start on Windows.
    182     import pty
    183     import termios
    184     pid, fd = pty.fork()
    185     if pid == 0:
    186       # We're the child. Transfer control to command.
    187       if close_stderr:
    188         dev_null = os.open('/dev/null', 0)
    189         os.dup2(dev_null, 2)
    190       os.execvp(args[0], args)
    191     else:
    192       # Disable echoing.
    193       attr = termios.tcgetattr(fd)
    194       attr[3] = attr[3] & ~termios.ECHO
    195       termios.tcsetattr(fd, termios.TCSANOW, attr)
    196       # Set up a file()-like interface to the child process
    197       self.r = os.fdopen(fd, "r", 1)
    198       self.w = os.fdopen(os.dup(fd), "w", 1)
    199 
    200   def convert(self, line):
    201     self.w.write(line + "\n")
    202     return self.readline()
    203 
    204   def readline(self):
    205     return self.r.readline().rstrip()
    206 
    207 
    208 class DarwinSymbolizer(Symbolizer):
    209   def __init__(self, addr, binary):
    210     super(DarwinSymbolizer, self).__init__()
    211     self.binary = binary
    212     self.arch = guess_arch(addr)
    213     self.open_atos()
    214 
    215   def open_atos(self):
    216     if DEBUG:
    217       print 'atos -o %s -arch %s' % (self.binary, self.arch)
    218     cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    219     self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)
    220 
    221   def symbolize(self, addr, binary, offset):
    222     """Overrides Symbolizer.symbolize."""
    223     if self.binary != binary:
    224       return None
    225     atos_line = self.atos.convert('0x%x' % int(offset, 16))
    226     while "got symbolicator for" in atos_line:
    227       atos_line = self.atos.readline()
    228     # A well-formed atos response looks like this:
    229     #   foo(type1, type2) (in object.name) (filename.cc:80)
    230     match = re.match('^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    231     if DEBUG:
    232       print 'atos_line: ', atos_line
    233     if match:
    234       function_name = match.group(1)
    235       function_name = re.sub('\(.*?\)', '', function_name)
    236       file_name = fix_filename(match.group(3))
    237       return ['%s in %s %s' % (addr, function_name, file_name)]
    238     else:
    239       return ['%s in %s' % (addr, atos_line)]
    240 
    241 
    242 # Chain several symbolizers so that if one symbolizer fails, we fall back
    243 # to the next symbolizer in chain.
    244 class ChainSymbolizer(Symbolizer):
    245   def __init__(self, symbolizer_list):
    246     super(ChainSymbolizer, self).__init__()
    247     self.symbolizer_list = symbolizer_list
    248 
    249   def symbolize(self, addr, binary, offset):
    250     """Overrides Symbolizer.symbolize."""
    251     for symbolizer in self.symbolizer_list:
    252       if symbolizer:
    253         result = symbolizer.symbolize(addr, binary, offset)
    254         if result:
    255           return result
    256     return None
    257 
    258   def append_symbolizer(self, symbolizer):
    259     self.symbolizer_list.append(symbolizer)
    260 
    261 
    262 def BreakpadSymbolizerFactory(binary):
    263   suffix = os.getenv('BREAKPAD_SUFFIX')
    264   if suffix:
    265     filename = binary + suffix
    266     if os.access(filename, os.F_OK):
    267       return BreakpadSymbolizer(filename)
    268   return None
    269 
    270 
    271 def SystemSymbolizerFactory(system, addr, binary):
    272   if system == 'Darwin':
    273     return DarwinSymbolizer(addr, binary)
    274   elif system == 'Linux':
    275     return Addr2LineSymbolizer(binary)
    276 
    277 
    278 class BreakpadSymbolizer(Symbolizer):
    279   def __init__(self, filename):
    280     super(BreakpadSymbolizer, self).__init__()
    281     self.filename = filename
    282     lines = file(filename).readlines()
    283     self.files = []
    284     self.symbols = {}
    285     self.address_list = []
    286     self.addresses = {}
    287     # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    288     fragments = lines[0].rstrip().split()
    289     self.arch = fragments[2]
    290     self.debug_id = fragments[3]
    291     self.binary = ' '.join(fragments[4:])
    292     self.parse_lines(lines[1:])
    293 
    294   def parse_lines(self, lines):
    295     cur_function_addr = ''
    296     for line in lines:
    297       fragments = line.split()
    298       if fragments[0] == 'FILE':
    299         assert int(fragments[1]) == len(self.files)
    300         self.files.append(' '.join(fragments[2:]))
    301       elif fragments[0] == 'PUBLIC':
    302         self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
    303       elif fragments[0] in ['CFI', 'STACK']:
    304         pass
    305       elif fragments[0] == 'FUNC':
    306         cur_function_addr = int(fragments[1], 16)
    307         if not cur_function_addr in self.symbols.keys():
    308           self.symbols[cur_function_addr] = ' '.join(fragments[4:])
    309       else:
    310         # Line starting with an address.
    311         addr = int(fragments[0], 16)
    312         self.address_list.append(addr)
    313         # Tuple of symbol address, size, line, file number.
    314         self.addresses[addr] = (cur_function_addr,
    315                                 int(fragments[1], 16),
    316                                 int(fragments[2]),
    317                                 int(fragments[3]))
    318     self.address_list.sort()
    319 
    320   def get_sym_file_line(self, addr):
    321     key = None
    322     if addr in self.addresses.keys():
    323       key = addr
    324     else:
    325       index = bisect.bisect_left(self.address_list, addr)
    326       if index == 0:
    327         return None
    328       else:
    329         key = self.address_list[index - 1]
    330     sym_id, size, line_no, file_no = self.addresses[key]
    331     symbol = self.symbols[sym_id]
    332     filename = self.files[file_no]
    333     if addr < key + size:
    334       return symbol, filename, line_no
    335     else:
    336       return None
    337 
    338   def symbolize(self, addr, binary, offset):
    339     if self.binary != binary:
    340       return None
    341     res = self.get_sym_file_line(int(offset, 16))
    342     if res:
    343       function_name, file_name, line_no = res
    344       result = ['%s in %s %s:%d' % (
    345           addr, function_name, file_name, line_no)]
    346       print result
    347       return result
    348     else:
    349       return None
    350 
    351 
    352 class SymbolizationLoop(object):
    353   def __init__(self, binary_name_filter=None, dsym_hint_producer=None):
    354     if sys.platform == 'win32':
    355       # ASan on Windows uses dbghelp.dll to symbolize in-process, which works
    356       # even in sandboxed processes.  Nothing needs to be done here.
    357       self.process_line = self.process_line_echo
    358     else:
    359       # Used by clients who may want to supply a different binary name.
    360       # E.g. in Chrome several binaries may share a single .dSYM.
    361       self.binary_name_filter = binary_name_filter
    362       self.dsym_hint_producer = dsym_hint_producer
    363       self.system = os.uname()[0]
    364       if self.system not in ['Linux', 'Darwin', 'FreeBSD']:
    365         raise Exception('Unknown system')
    366       self.llvm_symbolizers = {}
    367       self.last_llvm_symbolizer = None
    368       self.dsym_hints = set([])
    369       self.frame_no = 0
    370       self.process_line = self.process_line_posix
    371 
    372   def symbolize_address(self, addr, binary, offset):
    373     # On non-Darwin (i.e. on platforms without .dSYM debug info) always use
    374     # a single symbolizer binary.
    375     # On Darwin, if the dsym hint producer is present:
    376     #  1. check whether we've seen this binary already; if so,
    377     #     use |llvm_symbolizers[binary]|, which has already loaded the debug
    378     #     info for this binary (might not be the case for
    379     #     |last_llvm_symbolizer|);
    380     #  2. otherwise check if we've seen all the hints for this binary already;
    381     #     if so, reuse |last_llvm_symbolizer| which has the full set of hints;
    382     #  3. otherwise create a new symbolizer and pass all currently known
    383     #     .dSYM hints to it.
    384     if not binary in self.llvm_symbolizers:
    385       use_new_symbolizer = True
    386       if self.system == 'Darwin' and self.dsym_hint_producer:
    387         dsym_hints_for_binary = set(self.dsym_hint_producer(binary))
    388         use_new_symbolizer = bool(dsym_hints_for_binary - self.dsym_hints)
    389         self.dsym_hints |= dsym_hints_for_binary
    390       if self.last_llvm_symbolizer and not use_new_symbolizer:
    391           self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
    392       else:
    393         self.last_llvm_symbolizer = LLVMSymbolizerFactory(
    394             self.system, guess_arch(addr), self.dsym_hints)
    395         self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
    396     # Use the chain of symbolizers:
    397     # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
    398     # (fall back to next symbolizer if the previous one fails).
    399     if not binary in symbolizers:
    400       symbolizers[binary] = ChainSymbolizer(
    401           [BreakpadSymbolizerFactory(binary), self.llvm_symbolizers[binary]])
    402     result = symbolizers[binary].symbolize(addr, binary, offset)
    403     if result is None:
    404       if not allow_system_symbolizer:
    405         raise Exception('Failed to launch or use llvm-symbolizer.')
    406       # Initialize system symbolizer only if other symbolizers failed.
    407       symbolizers[binary].append_symbolizer(
    408           SystemSymbolizerFactory(self.system, addr, binary))
    409       result = symbolizers[binary].symbolize(addr, binary, offset)
    410     # The system symbolizer must produce some result.
    411     assert result
    412     return result
    413 
    414   def get_symbolized_lines(self, symbolized_lines):
    415     if not symbolized_lines:
    416       return [self.current_line]
    417     else:
    418       result = []
    419       for symbolized_frame in symbolized_lines:
    420         result.append('    #%s %s' % (str(self.frame_no), symbolized_frame.rstrip()))
    421         self.frame_no += 1
    422       return result
    423 
    424   def process_logfile(self):
    425     self.frame_no = 0
    426     for line in logfile:
    427       processed = self.process_line(line)
    428       print '\n'.join(processed)
    429 
    430   def process_line_echo(self, line):
    431     return [line.rstrip()]
    432 
    433   def process_line_posix(self, line):
    434     self.current_line = line.rstrip()
    435     #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
    436     stack_trace_line_format = (
    437         '^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')
    438     match = re.match(stack_trace_line_format, line)
    439     if not match:
    440       return [self.current_line]
    441     if DEBUG:
    442       print line
    443     _, frameno_str, addr, binary, offset = match.groups()
    444     if frameno_str == '0':
    445       # Assume that frame #0 is the first frame of new stack trace.
    446       self.frame_no = 0
    447     original_binary = binary
    448     if self.binary_name_filter:
    449       binary = self.binary_name_filter(binary)
    450     symbolized_line = self.symbolize_address(addr, binary, offset)
    451     if not symbolized_line:
    452       if original_binary != binary:
    453         symbolized_line = self.symbolize_address(addr, binary, offset)
    454     return self.get_symbolized_lines(symbolized_line)
    455 
    456 
    457 if __name__ == '__main__':
    458   parser = argparse.ArgumentParser(
    459       formatter_class=argparse.RawDescriptionHelpFormatter,
    460       description='ASan symbolization script',
    461       epilog='Example of use:\n'
    462              'asan_symbolize.py -c "$HOME/opt/cross/bin/arm-linux-gnueabi-" '
    463              '-s "$HOME/SymbolFiles" < asan.log')
    464   parser.add_argument('path_to_cut', nargs='*',
    465                       help='pattern to be cut from the result file path ')
    466   parser.add_argument('-d','--demangle', action='store_true',
    467                       help='demangle function names')
    468   parser.add_argument('-s', metavar='SYSROOT',
    469                       help='set path to sysroot for sanitized binaries')
    470   parser.add_argument('-c', metavar='CROSS_COMPILE',
    471                       help='set prefix for binutils')
    472   parser.add_argument('-l','--logfile', default=sys.stdin,
    473                       type=argparse.FileType('r'),
    474                       help='set log file name to parse, default is stdin')
    475   args = parser.parse_args()
    476   if args.path_to_cut:
    477     fix_filename_patterns = args.path_to_cut
    478   if args.demangle:
    479     demangle = True
    480   if args.s:
    481     binary_name_filter = sysroot_path_filter
    482     sysroot_path = args.s
    483   if args.c:
    484     binutils_prefix = args.c
    485   if args.logfile:
    486     logfile = args.logfile
    487   else:
    488     logfile = sys.stdin
    489   loop = SymbolizationLoop(binary_name_filter)
    490   loop.process_logfile()
    491