Home | History | Annotate | Download | only in third_party
      1 #!/usr/bin/env python
      2 #===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
      3 #
      4 #                     The LLVM Compiler Infrastructure
      5 #
      6 # This file is distributed under the University of Illinois Open Source
      7 # License. See LICENSE.TXT for details.
      8 #
      9 #===------------------------------------------------------------------------===#
     10 import argparse
     11 import bisect
     12 import getopt
     13 import os
     14 import pty
     15 import re
     16 import subprocess
     17 import sys
     18 import termios
     19 
# Per-binary cache of symbolizer chains, keyed by the binary path.
symbolizers = {}
# When True, the symbolizers print the commands and inputs they use.
DEBUG = False
# Whether function names are demangled (enabled by -d/--demangle).
demangle = False
# Prefix prepended to the addr2line tool name (set by -c).
binutils_prefix = None
# Directory prepended to binary paths by sysroot_path_filter (set by -s).
sysroot_path = None
# Optional callable that rewrites binary names; -s sets it to
# sysroot_path_filter.
binary_name_filter = None
# Patterns cut from reported file paths (the positional path_to_cut arguments).
fix_filename_patterns = None
# Log to symbolize; stays stdin unless -l/--logfile is given.
logfile = sys.stdin
     28 
     29 # FIXME: merge the code that calls fix_filename().
     30 def fix_filename(file_name):
     31   if fix_filename_patterns:
     32     for path_to_cut in fix_filename_patterns:
     33       file_name = re.sub('.*' + path_to_cut, '', file_name)
     34   file_name = re.sub('.*asan_[a-z_]*.cc:[0-9]*', '_asan_rtl_', file_name)
     35   file_name = re.sub('.*crtstuff.c:0', '???:0', file_name)
     36   return file_name
     37 
     38 def sysroot_path_filter(binary_name):
     39   return sysroot_path + binary_name
     40 
     41 def guess_arch(addr):
     42   # Guess which arch we're running. 10 = len('0x') + 8 hex digits.
     43   if len(addr) > 10:
     44     return 'x86_64'
     45   else:
     46     return 'i386'
     47 
     48 class Symbolizer(object):
     49   def __init__(self):
     50     pass
     51 
     52   def symbolize(self, addr, binary, offset):
     53     """Symbolize the given address (pair of binary and offset).
     54 
     55     Overriden in subclasses.
     56     Args:
     57         addr: virtual address of an instruction.
     58         binary: path to executable/shared object containing this instruction.
     59         offset: instruction offset in the @binary.
     60     Returns:
     61         list of strings (one string for each inlined frame) describing
     62         the code locations for this instruction (that is, function name, file
     63         name, line and column numbers).
     64     """
     65     return None
     66 
     67 
     68 class LLVMSymbolizer(Symbolizer):
     69   def __init__(self, symbolizer_path, addr):
     70     super(LLVMSymbolizer, self).__init__()
     71     self.symbolizer_path = symbolizer_path
     72     self.default_arch = guess_arch(addr)
     73     self.pipe = self.open_llvm_symbolizer()
     74 
     75   def open_llvm_symbolizer(self):
     76     cmd = [self.symbolizer_path,
     77            '--use-symbol-table=true',
     78            '--demangle=%s' % demangle,
     79            '--functions=short',
     80            '--inlining=true',
     81            '--default-arch=%s' % self.default_arch]
     82     if DEBUG:
     83       print ' '.join(cmd)
     84     try:
     85       result = subprocess.Popen(cmd, stdin=subprocess.PIPE,
     86                                 stdout=subprocess.PIPE)
     87     except OSError:
     88       result = None
     89     return result
     90 
     91   def symbolize(self, addr, binary, offset):
     92     """Overrides Symbolizer.symbolize."""
     93     if not self.pipe:
     94       return None
     95     result = []
     96     try:
     97       symbolizer_input = '%s %s' % (binary, offset)
     98       if DEBUG:
     99         print symbolizer_input
    100       print >> self.pipe.stdin, symbolizer_input
    101       while True:
    102         function_name = self.pipe.stdout.readline().rstrip()
    103         if not function_name:
    104           break
    105         file_name = self.pipe.stdout.readline().rstrip()
    106         file_name = fix_filename(file_name)
    107         if (not function_name.startswith('??') or
    108             not file_name.startswith('??')):
    109           # Append only non-trivial frames.
    110           result.append('%s in %s %s' % (addr, function_name,
    111                                          file_name))
    112     except Exception:
    113       result = []
    114     if not result:
    115       result = None
    116     return result
    117 
    118 
    119 def LLVMSymbolizerFactory(system, addr):
    120   symbolizer_path = os.getenv('LLVM_SYMBOLIZER_PATH')
    121   if not symbolizer_path:
    122     symbolizer_path = os.getenv('ASAN_SYMBOLIZER_PATH')
    123     if not symbolizer_path:
    124       # Assume llvm-symbolizer is in PATH.
    125       symbolizer_path = 'llvm-symbolizer'
    126   return LLVMSymbolizer(symbolizer_path, addr)
    127 
    128 
    129 class Addr2LineSymbolizer(Symbolizer):
    130   def __init__(self, binary):
    131     super(Addr2LineSymbolizer, self).__init__()
    132     self.binary = binary
    133     self.pipe = self.open_addr2line()
    134 
    135   def open_addr2line(self):
    136     addr2line_tool = 'addr2line'
    137     if binutils_prefix:
    138       addr2line_tool = binutils_prefix + addr2line_tool
    139     cmd = [addr2line_tool, '-f']
    140     if demangle:
    141       cmd += ['--demangle']
    142     cmd += ['-e', self.binary]
    143     if DEBUG:
    144       print ' '.join(cmd)
    145     return subprocess.Popen(cmd,
    146                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    147 
    148   def symbolize(self, addr, binary, offset):
    149     """Overrides Symbolizer.symbolize."""
    150     if self.binary != binary:
    151       return None
    152     try:
    153       print >> self.pipe.stdin, offset
    154       function_name = self.pipe.stdout.readline().rstrip()
    155       file_name = self.pipe.stdout.readline().rstrip()
    156     except Exception:
    157       function_name = ''
    158       file_name = ''
    159     file_name = fix_filename(file_name)
    160     return ['%s in %s %s' % (addr, function_name, file_name)]
    161 
    162 
    163 class UnbufferedLineConverter(object):
    164   """
    165   Wrap a child process that responds to each line of input with one line of
    166   output.  Uses pty to trick the child into providing unbuffered output.
    167   """
    168   def __init__(self, args, close_stderr=False):
    169     pid, fd = pty.fork()
    170     if pid == 0:
    171       # We're the child. Transfer control to command.
    172       if close_stderr:
    173         dev_null = os.open('/dev/null', 0)
    174         os.dup2(dev_null, 2)
    175       os.execvp(args[0], args)
    176     else:
    177       # Disable echoing.
    178       attr = termios.tcgetattr(fd)
    179       attr[3] = attr[3] & ~termios.ECHO
    180       termios.tcsetattr(fd, termios.TCSANOW, attr)
    181       # Set up a file()-like interface to the child process
    182       self.r = os.fdopen(fd, "r", 1)
    183       self.w = os.fdopen(os.dup(fd), "w", 1)
    184 
    185   def convert(self, line):
    186     self.w.write(line + "\n")
    187     return self.readline()
    188 
    189   def readline(self):
    190     return self.r.readline().rstrip()
    191 
    192 
    193 class DarwinSymbolizer(Symbolizer):
    194   def __init__(self, addr, binary):
    195     super(DarwinSymbolizer, self).__init__()
    196     self.binary = binary
    197     self.arch = guess_arch(addr)
    198     self.open_atos()
    199 
    200   def open_atos(self):
    201     if DEBUG:
    202       print 'atos -o %s -arch %s' % (self.binary, self.arch)
    203     cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    204     self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)
    205 
    206   def symbolize(self, addr, binary, offset):
    207     """Overrides Symbolizer.symbolize."""
    208     if self.binary != binary:
    209       return None
    210     atos_line = self.atos.convert('0x%x' % int(offset, 16))
    211     while "got symbolicator for" in atos_line:
    212       atos_line = self.atos.readline()
    213     # A well-formed atos response looks like this:
    214     #   foo(type1, type2) (in object.name) (filename.cc:80)
    215     match = re.match('^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    216     if DEBUG:
    217       print 'atos_line: ', atos_line
    218     if match:
    219       function_name = match.group(1)
    220       function_name = re.sub('\(.*?\)', '', function_name)
    221       file_name = fix_filename(match.group(3))
    222       return ['%s in %s %s' % (addr, function_name, file_name)]
    223     else:
    224       return ['%s in %s' % (addr, atos_line)]
    225 
    226 
    227 # Chain several symbolizers so that if one symbolizer fails, we fall back
    228 # to the next symbolizer in chain.
    229 class ChainSymbolizer(Symbolizer):
    230   def __init__(self, symbolizer_list):
    231     super(ChainSymbolizer, self).__init__()
    232     self.symbolizer_list = symbolizer_list
    233 
    234   def symbolize(self, addr, binary, offset):
    235     """Overrides Symbolizer.symbolize."""
    236     for symbolizer in self.symbolizer_list:
    237       if symbolizer:
    238         result = symbolizer.symbolize(addr, binary, offset)
    239         if result:
    240           return result
    241     return None
    242 
    243   def append_symbolizer(self, symbolizer):
    244     self.symbolizer_list.append(symbolizer)
    245 
    246 
    247 def BreakpadSymbolizerFactory(binary):
    248   suffix = os.getenv('BREAKPAD_SUFFIX')
    249   if suffix:
    250     filename = binary + suffix
    251     if os.access(filename, os.F_OK):
    252       return BreakpadSymbolizer(filename)
    253   return None
    254 
    255 
    256 def SystemSymbolizerFactory(system, addr, binary):
    257   if system == 'Darwin':
    258     return DarwinSymbolizer(addr, binary)
    259   elif system == 'Linux':
    260     return Addr2LineSymbolizer(binary)
    261 
    262 
    263 class BreakpadSymbolizer(Symbolizer):
    264   def __init__(self, filename):
    265     super(BreakpadSymbolizer, self).__init__()
    266     self.filename = filename
    267     lines = file(filename).readlines()
    268     self.files = []
    269     self.symbols = {}
    270     self.address_list = []
    271     self.addresses = {}
    272     # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    273     fragments = lines[0].rstrip().split()
    274     self.arch = fragments[2]
    275     self.debug_id = fragments[3]
    276     self.binary = ' '.join(fragments[4:])
    277     self.parse_lines(lines[1:])
    278 
    279   def parse_lines(self, lines):
    280     cur_function_addr = ''
    281     for line in lines:
    282       fragments = line.split()
    283       if fragments[0] == 'FILE':
    284         assert int(fragments[1]) == len(self.files)
    285         self.files.append(' '.join(fragments[2:]))
    286       elif fragments[0] == 'PUBLIC':
    287         self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
    288       elif fragments[0] in ['CFI', 'STACK']:
    289         pass
    290       elif fragments[0] == 'FUNC':
    291         cur_function_addr = int(fragments[1], 16)
    292         if not cur_function_addr in self.symbols.keys():
    293           self.symbols[cur_function_addr] = ' '.join(fragments[4:])
    294       else:
    295         # Line starting with an address.
    296         addr = int(fragments[0], 16)
    297         self.address_list.append(addr)
    298         # Tuple of symbol address, size, line, file number.
    299         self.addresses[addr] = (cur_function_addr,
    300                                 int(fragments[1], 16),
    301                                 int(fragments[2]),
    302                                 int(fragments[3]))
    303     self.address_list.sort()
    304 
    305   def get_sym_file_line(self, addr):
    306     key = None
    307     if addr in self.addresses.keys():
    308       key = addr
    309     else:
    310       index = bisect.bisect_left(self.address_list, addr)
    311       if index == 0:
    312         return None
    313       else:
    314         key = self.address_list[index - 1]
    315     sym_id, size, line_no, file_no = self.addresses[key]
    316     symbol = self.symbols[sym_id]
    317     filename = self.files[file_no]
    318     if addr < key + size:
    319       return symbol, filename, line_no
    320     else:
    321       return None
    322 
    323   def symbolize(self, addr, binary, offset):
    324     if self.binary != binary:
    325       return None
    326     res = self.get_sym_file_line(int(offset, 16))
    327     if res:
    328       function_name, file_name, line_no = res
    329       result = ['%s in %s %s:%d' % (
    330           addr, function_name, file_name, line_no)]
    331       print result
    332       return result
    333     else:
    334       return None
    335 
    336 
    337 class SymbolizationLoop(object):
    338   def __init__(self, binary_name_filter=None):
    339     # Used by clients who may want to supply a different binary name.
    340     # E.g. in Chrome several binaries may share a single .dSYM.
    341     self.binary_name_filter = binary_name_filter
    342     self.system = os.uname()[0]
    343     if self.system not in ['Linux', 'Darwin', 'FreeBSD']:
    344       raise Exception('Unknown system')
    345     self.llvm_symbolizer = None
    346     self.frame_no = 0
    347 
    348   def symbolize_address(self, addr, binary, offset):
    349     # Initialize llvm-symbolizer lazily.
    350     if not self.llvm_symbolizer:
    351       self.llvm_symbolizer = LLVMSymbolizerFactory(self.system, addr)
    352     # Use the chain of symbolizers:
    353     # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
    354     # (fall back to next symbolizer if the previous one fails).
    355     if not binary in symbolizers:
    356       symbolizers[binary] = ChainSymbolizer(
    357           [BreakpadSymbolizerFactory(binary), self.llvm_symbolizer])
    358     result = symbolizers[binary].symbolize(addr, binary, offset)
    359     if result is None:
    360       # Initialize system symbolizer only if other symbolizers failed.
    361       symbolizers[binary].append_symbolizer(
    362           SystemSymbolizerFactory(self.system, addr, binary))
    363       result = symbolizers[binary].symbolize(addr, binary, offset)
    364     # The system symbolizer must produce some result.
    365     assert result
    366     return result
    367 
    368   def get_symbolized_lines(self, symbolized_lines):
    369     if not symbolized_lines:
    370       return [self.current_line]
    371     else:
    372       result = []
    373       for symbolized_frame in symbolized_lines:
    374         result.append('    #%s %s' % (str(self.frame_no), symbolized_frame.rstrip()))
    375         self.frame_no += 1
    376       return result
    377 
    378   def process_logfile(self):
    379     self.frame_no = 0
    380     while True:
    381       line = logfile.readline()
    382       if not line:
    383         break
    384       processed = self.process_line(line)
    385       print '\n'.join(processed)
    386 
    387   def process_line(self, line):
    388     self.current_line = line.rstrip()
    389     #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
    390     stack_trace_line_format = (
    391         '^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')
    392     match = re.match(stack_trace_line_format, line)
    393     if not match:
    394       return [self.current_line]
    395     if DEBUG:
    396       print line
    397     _, frameno_str, addr, binary, offset = match.groups()
    398     if frameno_str == '0':
    399       # Assume that frame #0 is the first frame of new stack trace.
    400       self.frame_no = 0
    401     original_binary = binary
    402     if self.binary_name_filter:
    403       binary = self.binary_name_filter(binary)
    404     symbolized_line = self.symbolize_address(addr, binary, offset)
    405     if not symbolized_line:
    406       if original_binary != binary:
    407         symbolized_line = self.symbolize_address(addr, binary, offset)
    408     return self.get_symbolized_lines(symbolized_line)
    409 
    410 
    411 if __name__ == '__main__':
    412   parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
    413   description='ASan symbolization script',
    414   epilog='''Example of use:
    415   asan_symbolize.py -c "$HOME/opt/cross/bin/arm-linux-gnueabi-" -s "$HOME/SymbolFiles" < asan.log''')
    416   parser.add_argument('path_to_cut', nargs='*',
    417     help='pattern to be cut from the result file path ')
    418   parser.add_argument('-d','--demangle', action='store_true',
    419     help='demangle function names')
    420   parser.add_argument('-s', metavar='SYSROOT',
    421     help='set path to sysroot for sanitized binaries')
    422   parser.add_argument('-c', metavar='CROSS_COMPILE',
    423     help='set prefix for binutils')
    424   parser.add_argument('-l','--logfile', default=sys.stdin, type=argparse.FileType('r'),
    425     help='set log file name to parse, default is stdin')
    426   args = parser.parse_args()
    427   if args.path_to_cut:
    428     fix_filename_patterns = args.path_to_cut
    429   if args.demangle:
    430     demangle = True
    431   if args.s:
    432     binary_name_filter = sysroot_path_filter
    433     sysroot_path = args.s
    434   if args.c:
    435     binutils_prefix = args.c
    436   if args.logfile:
    437     logfile = args.logfile
    438   else:
    439     logfile = sys.stdin
    440   loop = SymbolizationLoop(binary_name_filter)
    441   loop.process_logfile()
    442