Home | History | Annotate | Download | only in scripts
      1 #!/usr/bin/env python
      2 #===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
      3 #
      4 #                     The LLVM Compiler Infrastructure
      5 #
      6 # This file is distributed under the University of Illinois Open Source
      7 # License. See LICENSE.TXT for details.
      8 #
      9 #===------------------------------------------------------------------------===#
     10 import bisect
     11 import getopt
     12 import os
     13 import pty
     14 import re
     15 import subprocess
     16 import sys
     17 import termios
     18 
     19 symbolizers = {}
     20 DEBUG = False
     21 demangle = False;
     22 
     23 
     24 # FIXME: merge the code that calls fix_filename().
     25 def fix_filename(file_name):
     26   for path_to_cut in sys.argv[1:]:
     27     file_name = re.sub('.*' + path_to_cut, '', file_name)
     28   file_name = re.sub('.*asan_[a-z_]*.cc:[0-9]*', '_asan_rtl_', file_name)
     29   file_name = re.sub('.*crtstuff.c:0', '???:0', file_name)
     30   return file_name
     31 
     32 def GuessArch(addr):
     33   # Guess which arch we're running. 10 = len('0x') + 8 hex digits.
     34   if len(addr) > 10:
     35     return 'x86_64'
     36   else:
     37     return 'i386'
     38 
     39 class Symbolizer(object):
     40   def __init__(self):
     41     pass
     42 
     43   def symbolize(self, addr, binary, offset):
     44     """Symbolize the given address (pair of binary and offset).
     45 
     46     Overriden in subclasses.
     47     Args:
     48         addr: virtual address of an instruction.
     49         binary: path to executable/shared object containing this instruction.
     50         offset: instruction offset in the @binary.
     51     Returns:
     52         list of strings (one string for each inlined frame) describing
     53         the code locations for this instruction (that is, function name, file
     54         name, line and column numbers).
     55     """
     56     return None
     57 
     58 
     59 class LLVMSymbolizer(Symbolizer):
     60   def __init__(self, symbolizer_path, addr):
     61     super(LLVMSymbolizer, self).__init__()
     62     self.symbolizer_path = symbolizer_path
     63     self.default_arch = GuessArch(addr)
     64     self.pipe = self.open_llvm_symbolizer()
     65 
     66   def open_llvm_symbolizer(self):
     67     cmd = [self.symbolizer_path,
     68            '--use-symbol-table=true',
     69            '--demangle=%s' % demangle,
     70            '--functions=short',
     71            '--inlining=true',
     72            '--default-arch=%s' % self.default_arch]
     73     if DEBUG:
     74       print ' '.join(cmd)
     75     try:
     76       result = subprocess.Popen(cmd, stdin=subprocess.PIPE,
     77                                 stdout=subprocess.PIPE)
     78     except OSError:
     79       result = None
     80     return result
     81 
     82   def symbolize(self, addr, binary, offset):
     83     """Overrides Symbolizer.symbolize."""
     84     if not self.pipe:
     85       return None
     86     result = []
     87     try:
     88       symbolizer_input = '%s %s' % (binary, offset)
     89       if DEBUG:
     90         print symbolizer_input
     91       print >> self.pipe.stdin, symbolizer_input
     92       while True:
     93         function_name = self.pipe.stdout.readline().rstrip()
     94         if not function_name:
     95           break
     96         file_name = self.pipe.stdout.readline().rstrip()
     97         file_name = fix_filename(file_name)
     98         if (not function_name.startswith('??') or
     99             not file_name.startswith('??')):
    100           # Append only non-trivial frames.
    101           result.append('%s in %s %s' % (addr, function_name,
    102                                          file_name))
    103     except Exception:
    104       result = []
    105     if not result:
    106       result = None
    107     return result
    108 
    109 
    110 def LLVMSymbolizerFactory(system, addr):
    111   symbolizer_path = os.getenv('LLVM_SYMBOLIZER_PATH')
    112   if not symbolizer_path:
    113     symbolizer_path = os.getenv('ASAN_SYMBOLIZER_PATH')
    114     if not symbolizer_path:
    115       # Assume llvm-symbolizer is in PATH.
    116       symbolizer_path = 'llvm-symbolizer'
    117   return LLVMSymbolizer(symbolizer_path, addr)
    118 
    119 
    120 class Addr2LineSymbolizer(Symbolizer):
    121   def __init__(self, binary):
    122     super(Addr2LineSymbolizer, self).__init__()
    123     self.binary = binary
    124     self.pipe = self.open_addr2line()
    125 
    126   def open_addr2line(self):
    127     cmd = ['addr2line', '-f']
    128     if demangle:
    129       cmd += ['--demangle']
    130     cmd += ['-e', self.binary]
    131     if DEBUG:
    132       print ' '.join(cmd)
    133     return subprocess.Popen(cmd,
    134                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    135 
    136   def symbolize(self, addr, binary, offset):
    137     """Overrides Symbolizer.symbolize."""
    138     if self.binary != binary:
    139       return None
    140     try:
    141       print >> self.pipe.stdin, offset
    142       function_name = self.pipe.stdout.readline().rstrip()
    143       file_name = self.pipe.stdout.readline().rstrip()
    144     except Exception:
    145       function_name = ''
    146       file_name = ''
    147     file_name = fix_filename(file_name)
    148     return ['%s in %s %s' % (addr, function_name, file_name)]
    149 
    150 
class UnbufferedLineConverter(object):
  """
  Wrap a child process that responds to each line of input with one line of
  output.  Uses pty to trick the child into providing unbuffered output.

  The parent side keeps two file objects on the pty master: self.r for
  reading the child's output and self.w for writing its input.
  """
  def __init__(self, args, close_stderr=False):
    """Fork onto a new pseudo-terminal and exec the command in the child.

    Args:
        args: command line as a list; args[0] is the program to execute.
        close_stderr: when True, the child's fd 2 is replaced by a
            descriptor for /dev/null before the exec.
    """
    pid, fd = pty.fork()
    if pid == 0:
      # We're the child. Transfer control to command.
      if close_stderr:
        # NOTE(review): flags value 0 is O_RDONLY on POSIX systems, so the
        # new stderr is not writable -- confirm that is intended.
        dev_null = os.open('/dev/null', 0)
        os.dup2(dev_null, 2)
      # NOTE(review): nothing here handles execvp raising; in that case the
      # exception propagates inside the forked child -- confirm acceptable.
      os.execvp(args[0], args)
    else:
      # Disable echoing.
      # Index 3 of the tcgetattr() list holds the local-mode flags.
      attr = termios.tcgetattr(fd)
      attr[3] = attr[3] & ~termios.ECHO
      termios.tcsetattr(fd, termios.TCSANOW, attr)
      # Set up a file()-like interface to the child process
      # Both objects are opened with buffering=1 (line buffered); the writer
      # gets its own duplicate of the pty descriptor.
      self.r = os.fdopen(fd, "r", 1)
      self.w = os.fdopen(os.dup(fd), "w", 1)

  def convert(self, line):
    """Send one line to the child and return its next output line."""
    self.w.write(line + "\n")
    return self.readline()

  def readline(self):
    """Read one line of child output with trailing whitespace stripped."""
    return self.r.readline().rstrip()
    179 
    180 
    181 class DarwinSymbolizer(Symbolizer):
    182   def __init__(self, addr, binary):
    183     super(DarwinSymbolizer, self).__init__()
    184     self.binary = binary
    185     self.arch = GuessArch(addr)
    186     self.open_atos()
    187 
    188   def open_atos(self):
    189     if DEBUG:
    190       print 'atos -o %s -arch %s' % (self.binary, self.arch)
    191     cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    192     self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)
    193 
    194   def symbolize(self, addr, binary, offset):
    195     """Overrides Symbolizer.symbolize."""
    196     if self.binary != binary:
    197       return None
    198     atos_line = self.atos.convert('0x%x' % int(offset, 16))
    199     while "got symbolicator for" in atos_line:
    200       atos_line = self.atos.readline()
    201     # A well-formed atos response looks like this:
    202     #   foo(type1, type2) (in object.name) (filename.cc:80)
    203     match = re.match('^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    204     if DEBUG:
    205       print 'atos_line: ', atos_line
    206     if match:
    207       function_name = match.group(1)
    208       function_name = re.sub('\(.*?\)', '', function_name)
    209       file_name = fix_filename(match.group(3))
    210       return ['%s in %s %s' % (addr, function_name, file_name)]
    211     else:
    212       return ['%s in %s' % (addr, atos_line)]
    213 
    214 
    215 # Chain several symbolizers so that if one symbolizer fails, we fall back
    216 # to the next symbolizer in chain.
    217 class ChainSymbolizer(Symbolizer):
    218   def __init__(self, symbolizer_list):
    219     super(ChainSymbolizer, self).__init__()
    220     self.symbolizer_list = symbolizer_list
    221 
    222   def symbolize(self, addr, binary, offset):
    223     """Overrides Symbolizer.symbolize."""
    224     for symbolizer in self.symbolizer_list:
    225       if symbolizer:
    226         result = symbolizer.symbolize(addr, binary, offset)
    227         if result:
    228           return result
    229     return None
    230 
    231   def append_symbolizer(self, symbolizer):
    232     self.symbolizer_list.append(symbolizer)
    233 
    234 
    235 def BreakpadSymbolizerFactory(binary):
    236   suffix = os.getenv('BREAKPAD_SUFFIX')
    237   if suffix:
    238     filename = binary + suffix
    239     if os.access(filename, os.F_OK):
    240       return BreakpadSymbolizer(filename)
    241   return None
    242 
    243 
    244 def SystemSymbolizerFactory(system, addr, binary):
    245   if system == 'Darwin':
    246     return DarwinSymbolizer(addr, binary)
    247   elif system == 'Linux':
    248     return Addr2LineSymbolizer(binary)
    249 
    250 
    251 class BreakpadSymbolizer(Symbolizer):
    252   def __init__(self, filename):
    253     super(BreakpadSymbolizer, self).__init__()
    254     self.filename = filename
    255     lines = file(filename).readlines()
    256     self.files = []
    257     self.symbols = {}
    258     self.address_list = []
    259     self.addresses = {}
    260     # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    261     fragments = lines[0].rstrip().split()
    262     self.arch = fragments[2]
    263     self.debug_id = fragments[3]
    264     self.binary = ' '.join(fragments[4:])
    265     self.parse_lines(lines[1:])
    266 
    267   def parse_lines(self, lines):
    268     cur_function_addr = ''
    269     for line in lines:
    270       fragments = line.split()
    271       if fragments[0] == 'FILE':
    272         assert int(fragments[1]) == len(self.files)
    273         self.files.append(' '.join(fragments[2:]))
    274       elif fragments[0] == 'PUBLIC':
    275         self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
    276       elif fragments[0] in ['CFI', 'STACK']:
    277         pass
    278       elif fragments[0] == 'FUNC':
    279         cur_function_addr = int(fragments[1], 16)
    280         if not cur_function_addr in self.symbols.keys():
    281           self.symbols[cur_function_addr] = ' '.join(fragments[4:])
    282       else:
    283         # Line starting with an address.
    284         addr = int(fragments[0], 16)
    285         self.address_list.append(addr)
    286         # Tuple of symbol address, size, line, file number.
    287         self.addresses[addr] = (cur_function_addr,
    288                                 int(fragments[1], 16),
    289                                 int(fragments[2]),
    290                                 int(fragments[3]))
    291     self.address_list.sort()
    292 
    293   def get_sym_file_line(self, addr):
    294     key = None
    295     if addr in self.addresses.keys():
    296       key = addr
    297     else:
    298       index = bisect.bisect_left(self.address_list, addr)
    299       if index == 0:
    300         return None
    301       else:
    302         key = self.address_list[index - 1]
    303     sym_id, size, line_no, file_no = self.addresses[key]
    304     symbol = self.symbols[sym_id]
    305     filename = self.files[file_no]
    306     if addr < key + size:
    307       return symbol, filename, line_no
    308     else:
    309       return None
    310 
    311   def symbolize(self, addr, binary, offset):
    312     if self.binary != binary:
    313       return None
    314     res = self.get_sym_file_line(int(offset, 16))
    315     if res:
    316       function_name, file_name, line_no = res
    317       result = ['%s in %s %s:%d' % (
    318           addr, function_name, file_name, line_no)]
    319       print result
    320       return result
    321     else:
    322       return None
    323 
    324 
    325 class SymbolizationLoop(object):
    326   def __init__(self, binary_name_filter=None):
    327     # Used by clients who may want to supply a different binary name.
    328     # E.g. in Chrome several binaries may share a single .dSYM.
    329     self.binary_name_filter = binary_name_filter
    330     self.system = os.uname()[0]
    331     if self.system not in ['Linux', 'Darwin']:
    332       raise Exception('Unknown system')
    333     self.llvm_symbolizer = None
    334 
    335   def symbolize_address(self, addr, binary, offset):
    336     # Initialize llvm-symbolizer lazily.
    337     if not self.llvm_symbolizer:
    338       self.llvm_symbolizer = LLVMSymbolizerFactory(self.system, addr)
    339     # Use the chain of symbolizers:
    340     # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
    341     # (fall back to next symbolizer if the previous one fails).
    342     if not binary in symbolizers:
    343       symbolizers[binary] = ChainSymbolizer(
    344           [BreakpadSymbolizerFactory(binary), self.llvm_symbolizer])
    345     result = symbolizers[binary].symbolize(addr, binary, offset)
    346     if result is None:
    347       # Initialize system symbolizer only if other symbolizers failed.
    348       symbolizers[binary].append_symbolizer(
    349           SystemSymbolizerFactory(self.system, addr, binary))
    350       result = symbolizers[binary].symbolize(addr, binary, offset)
    351     # The system symbolizer must produce some result.
    352     assert result
    353     return result
    354 
    355   def print_symbolized_lines(self, symbolized_lines):
    356     if not symbolized_lines:
    357       print self.current_line
    358     else:
    359       for symbolized_frame in symbolized_lines:
    360         print '    #' + str(self.frame_no) + ' ' + symbolized_frame.rstrip()
    361         self.frame_no += 1
    362 
    363   def process_stdin(self):
    364     self.frame_no = 0
    365     while True:
    366       line = sys.stdin.readline()
    367       if not line:
    368         break
    369       self.current_line = line.rstrip()
    370       #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
    371       stack_trace_line_format = (
    372           '^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')
    373       match = re.match(stack_trace_line_format, line)
    374       if not match:
    375         print self.current_line
    376         continue
    377       if DEBUG:
    378         print line
    379       _, frameno_str, addr, binary, offset = match.groups()
    380       if frameno_str == '0':
    381         # Assume that frame #0 is the first frame of new stack trace.
    382         self.frame_no = 0
    383       original_binary = binary
    384       if self.binary_name_filter:
    385         binary = self.binary_name_filter(binary)
    386       symbolized_line = self.symbolize_address(addr, binary, offset)
    387       if not symbolized_line:
    388         if original_binary != binary:
    389           symbolized_line = self.symbolize_address(addr, binary, offset)
    390       self.print_symbolized_lines(symbolized_line)
    391 
    392 
    393 if __name__ == '__main__':
    394   opts, args = getopt.getopt(sys.argv[1:], "d", ["demangle"])
    395   for o, a in opts:
    396     if o in ("-d", "--demangle"):
    397       demangle = True;
    398   loop = SymbolizationLoop()
    399   loop.process_stdin()
    400