Home | History | Annotate | Download | only in mac
      1 #!/usr/bin/env python
      2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """
      7 This script can take an Apple-style CrashReporter log and symbolicate it. This
      8 is useful for when a user's reports aren't being uploaded, for example.
      9 
Only report versions 6, 7, 8, and 9 are supported. For more information on the
file format, refer to this document:
     12   TN2123 <http://developer.apple.com/library/mac/#technotes/tn2004/tn2123.html>
     13 
     14 Information on symbolication was gleaned from:
     15   <http://developer.apple.com/tools/xcode/symbolizingcrashdumps.html>
     16 """
     17 
     18 import optparse
     19 import os.path
     20 import re
     21 import subprocess
     22 import sys
     23 
     24 # Maps binary image identifiers to binary names (minus the .dSYM portion) found
     25 # in the archive. These are the only objects that will be looked up.
     26 SYMBOL_IMAGE_MAP = {
     27   'com.google.Chrome': 'Google Chrome.app',
     28   'com.google.Chrome.framework': 'Google Chrome Framework.framework',
     29   'com.google.Chrome.helper': 'Google Chrome Helper.app'
     30 }
     31 
     32 class CrashReport(object):
     33   """A parsed representation of an Apple CrashReport text file."""
     34   def __init__(self, file_name):
     35     super(CrashReport, self).__init__()
     36     self.report_info = {}
     37     self.threads = []
     38     self._binary_images = {}
     39 
     40     fd = open(file_name, 'r')
     41     self._ParseHeader(fd)
     42 
     43     # Try and get the report version. If it's not a version we handle, abort.
     44     self.report_version = int(self.report_info['Report Version'])
     45     # Version 6: 10.5 and 10.6 crash report
     46     # Version 7: 10.6 spindump report
     47     # Version 8: 10.7 spindump report
     48     # Version 9: 10.7 crash report
     49     valid_versions = (6, 7, 8, 9)
     50     if self.report_version not in valid_versions:
     51       raise Exception("Only crash reports of versions %s are accepted." %
     52           str(valid_versions))
     53 
     54     # If this is a spindump (version 7 or 8 report), use a special parser. The
     55     # format is undocumented, but is similar to version 6. However, the spindump
     56     # report contains user and kernel stacks for every process on the system.
     57     if self.report_version == 7 or self.report_version == 8:
     58       self._ParseSpindumpStack(fd)
     59     else:
     60       self._ParseStack(fd)
     61 
     62     self._ParseBinaryImages(fd)
     63     fd.close()
     64 
     65   def Symbolicate(self, symbol_path):
     66     """Symbolicates a crash report stack trace."""
     67     # In order to be efficient, collect all the offsets that will be passed to
     68     # atos by the image name.
     69     offsets_by_image = self._CollectAddressesForImages(SYMBOL_IMAGE_MAP.keys())
     70 
     71     # For each image, run atos with the list of addresses.
     72     for image_name, addresses in offsets_by_image.items():
     73       # If this image was not loaded or is in no stacks, skip.
     74       if image_name not in self._binary_images or not len(addresses):
     75         continue
     76 
     77       # Combine the |image_name| and |symbol_path| into the path of the dSYM.
     78       dsym_file = self._GetDSymPath(symbol_path, image_name)
     79 
     80       # From the list of 2-Tuples of (frame, address), create a list of just
     81       # addresses.
     82       address_list = map(lambda x: x[1], addresses)
     83 
     84       # Look up the load address of the image.
     85       binary_base = self._binary_images[image_name][0]
     86 
     87       # This returns a list of just symbols. The indices will match up with the
     88       # list of |addresses|.
     89       symbol_names = self._RunAtos(binary_base, dsym_file, address_list)
     90       if not symbol_names:
     91         print 'Error loading symbols for ' + image_name
     92         continue
     93 
     94       # Attaches a list of symbol names to stack frames. This assumes that the
     95       # order of |addresses| has stayed the same as |symbol_names|.
     96       self._AddSymbolsToFrames(symbol_names, addresses)
     97 
     98   def _ParseHeader(self, fd):
     99     """Parses the header section of a crash report, which contains the OS and
    100     application version information."""
    101     # The header is made up of different sections, depending on the type of
    102     # report and the report version. Almost all have a format of a key and
    103     # value separated by a colon. Accumulate all of these artifacts into a
    104     # dictionary until the first thread stack is reached.
    105     thread_re = re.compile('^[ \t]*Thread ([a-f0-9]+)')
    106     line = ''
    107     while not thread_re.match(line):
    108       # Skip blank lines. There are typically three or four sections separated
    109       # by newlines in the header.
    110       line = line.strip()
    111       if line:
    112         parts = line.split(':', 1)
    113         # Certain lines in different report versions don't follow the key-value
    114         # format, so skip them.
    115         if len(parts) == 2:
    116           # There's a varying amount of space padding after the ':' to align all
    117           # the values; strip that.
    118           self.report_info[parts[0]] = parts[1].lstrip()
    119       line = fd.readline()
    120 
    121     # When this loop exits, the header has been read in full. However, the first
    122     # thread stack heading has been read past. Seek backwards from the current
    123     # position by the length of the line so that it is re-read when
    124     # _ParseStack() is entered.
    125     fd.seek(-len(line), os.SEEK_CUR)
    126 
    127   def _ParseStack(self, fd):
    128     """Parses the stack dump of a crash report and creates a list of threads
    129     and their stack traces."""
    130     # Compile a regex that matches the start of a thread stack. Note that this
    131     # must be specific to not include the thread state section, which comes
    132     # right after all the stack traces.
    133     line_re = re.compile('^Thread ([0-9]+)( Crashed)?:(.*)')
    134 
    135     # On entry into this function, the fd has been walked up to the "Thread 0"
    136     # line.
    137     line = fd.readline().rstrip()
    138     in_stack = False
    139     thread = None
    140     while line_re.match(line) or in_stack:
    141       # Check for start of the thread stack.
    142       matches = line_re.match(line)
    143 
    144       if not line.strip():
    145         # A blank line indicates a break in the thread stack.
    146         in_stack = False
    147       elif matches:
    148         # If this is the start of a thread stack, create the CrashThread.
    149         in_stack = True
    150         thread = CrashThread(matches.group(1))
    151         thread.name = matches.group(3)
    152         thread.did_crash = matches.group(2) != None
    153         self.threads.append(thread)
    154       else:
    155         # All other lines are stack frames.
    156         thread.stack.append(self._ParseStackFrame(line))
    157       # Read the next line.
    158       line = fd.readline()
    159 
    160   def _ParseStackFrame(self, line):
    161     """Takes in a single line of text and transforms it into a StackFrame."""
    162     frame = StackFrame(line)
    163 
    164     # A stack frame is in the format of:
    165     # |<frame-number> <binary-image> 0x<address> <symbol> <offset>|.
    166     regex = '^([0-9]+) +(.+)[ \t]+(0x[0-9a-f]+) (.*) \+ ([0-9]+)$'
    167     matches = re.match(regex, line)
    168     if matches is None:
    169       return frame
    170 
    171     # Create a stack frame with the information extracted from the regex.
    172     frame.frame_id = matches.group(1)
    173     frame.image = matches.group(2)
    174     frame.address = int(matches.group(3), 0)  # Convert HEX to an int.
    175     frame.original_symbol = matches.group(4)
    176     frame.offset = matches.group(5)
    177     frame.line = None
    178     return frame
    179 
    180   def _ParseSpindumpStack(self, fd):
    181     """Parses a spindump stack report. In this format, each thread stack has
    182     both a user and kernel trace. Only the user traces are symbolicated."""
    183 
    184     # The stack trace begins with the thread header, which is identified by a
    185     # HEX number. The thread names appear to be incorrect in spindumps.
    186     user_thread_re = re.compile('^  Thread ([0-9a-fx]+)')
    187 
    188     # When this method is called, the fd has been walked right up to the first
    189     # line.
    190     line = fd.readline()
    191     in_user_stack = False
    192     in_kernel_stack = False
    193     thread = None
    194     frame_id = 0
    195     while user_thread_re.match(line) or in_user_stack or in_kernel_stack:
    196       # Check for the start of a thread.
    197       matches = user_thread_re.match(line)
    198 
    199       if not line.strip():
    200         # A blank line indicates the start of a new thread. The blank line comes
    201         # after the kernel stack before a new thread header.
    202         in_kernel_stack = False
    203       elif matches:
    204         # This is the start of a thread header. The next line is the heading for
    205         # the user stack, followed by the actual trace.
    206         thread = CrashThread(matches.group(1))
    207         frame_id = 0
    208         self.threads.append(thread)
    209         in_user_stack = True
    210         line = fd.readline()  # Read past the 'User stack:' header.
    211       elif line.startswith('  Kernel stack:'):
    212         # The kernel stack header comes immediately after the last frame (really
    213         # the top frame) in the user stack, without a blank line.
    214         in_user_stack = False
    215         in_kernel_stack = True
    216       elif in_user_stack:
    217         # If this is a line while in the user stack, parse it as a stack frame.
    218         thread.stack.append(self._ParseSpindumpStackFrame(line))
    219       # Loop with the next line.
    220       line = fd.readline()
    221 
    222     # When the loop exits, the file has been read through the 'Binary images:'
    223     # header. Seek backwards so that _ParseBinaryImages() does the right thing.
    224     fd.seek(-len(line), os.SEEK_CUR)
    225 
    226   def _ParseSpindumpStackFrame(self, line):
    227     """Parses a spindump-style stackframe."""
    228     frame = StackFrame(line)
    229 
    230     # The format of the frame is either:
    231     # A: |<space><steps> <symbol> + <offset> (in <image-name>) [<address>]|
    232     # B: |<space><steps> ??? (in <image-name> + <offset>) [<address>]|
    233     regex_a = '^([ ]+[0-9]+) (.*) \+ ([0-9]+) \(in (.*)\) \[(0x[0-9a-f]+)\]'
    234     regex_b = '^([ ]+[0-9]+) \?\?\?( \(in (.*) \+ ([0-9]+)\))? \[(0x[0-9a-f]+)\]'
    235 
    236     # Create the stack frame with the information extracted from the regex.
    237     matches = re.match(regex_a, line)
    238     if matches:
    239       frame.frame_id = matches.group(1)[4:]  # Remove some leading spaces.
    240       frame.original_symbol = matches.group(2)
    241       frame.offset = matches.group(3)
    242       frame.image = matches.group(4)
    243       frame.address = int(matches.group(5), 0)
    244       frame.line = None
    245       return frame
    246 
    247     # If pattern A didn't match (which it will most of the time), try B.
    248     matches = re.match(regex_b, line)
    249     if matches:
    250       frame.frame_id = matches.group(1)[4:]  # Remove some leading spaces.
    251       frame.image = matches.group(3)
    252       frame.offset = matches.group(4)
    253       frame.address = int(matches.group(5), 0)
    254       frame.line = None
    255       return frame
    256 
    257     # Otherwise, this frame could not be matched and just use the raw input.
    258     frame.line = frame.line.strip()
    259     return frame
    260 
    261   def _ParseBinaryImages(self, fd):
    262     """Parses out the binary images section in order to get the load offset."""
    263     # The parser skips some sections, so advance until the "Binary Images"
    264     # header is reached.
    265     while not fd.readline().lstrip().startswith("Binary Images:"): pass
    266 
    267     # Create a regex to match the lines of format:
    268     # |0x<start> - 0x<end> <binary-image> <version> (<version>) <<UUID>> <path>|
    269     image_re = re.compile(
    270         '[ ]*(0x[0-9a-f]+) -[ \t]+(0x[0-9a-f]+) [+ ]([a-zA-Z0-9._\-]+)')
    271 
    272     # This section is in this format:
    273     # |<start address> - <end address> <image name>|.
    274     while True:
    275       line = fd.readline()
    276       if not line.strip():
    277         # End when a blank line is hit.
    278         return
    279       # Match the line to the regex.
    280       match = image_re.match(line)
    281       if match:
    282         # Store the offsets by image name so it can be referenced during
    283         # symbolication. These are hex numbers with leading '0x', so int() can
    284         # convert them to decimal if base=0.
    285         address_range = (int(match.group(1), 0), int(match.group(2), 0))
    286         self._binary_images[match.group(3)] = address_range
    287 
    288   def _CollectAddressesForImages(self, images):
    289     """Iterates all the threads and stack frames and all the stack frames that
    290     are in a list of binary |images|. The result is a dictionary, keyed by the
    291     image name that maps to a list of tuples. Each is a 2-Tuple of
    292     (stack_frame, address)"""
    293     # Create the collection and initialize it with empty lists for each image.
    294     collection = {}
    295     for image in images:
    296       collection[image] = []
    297 
    298     # Perform the iteration.
    299     for thread in self.threads:
    300       for frame in thread.stack:
    301         image_name = self._ImageForAddress(frame.address)
    302         if image_name in images:
    303           # Replace the image name in the frame in case it was elided.
    304           frame.image = image_name
    305           collection[frame.image].append((frame, frame.address))
    306 
    307     # Return the result.
    308     return collection
    309 
    310   def _ImageForAddress(self, address):
    311     """Given a PC address, returns the bundle identifier of the image in which
    312     the address resides."""
    313     for image_name, address_range in self._binary_images.items():
    314       if address >= address_range[0] and address <= address_range[1]:
    315         return image_name
    316     return None
    317 
    318   def _GetDSymPath(self, base_path, image_name):
    319     """Takes a base path for the symbols and an image name. It looks the name up
    320     in SYMBOL_IMAGE_MAP and creates a full path to the dSYM in the bundle."""
    321     image_file = SYMBOL_IMAGE_MAP[image_name]
    322     return os.path.join(base_path, image_file + '.dSYM', 'Contents',
    323         'Resources', 'DWARF',
    324         os.path.splitext(image_file)[0])  # Chop off the extension.
    325 
    326   def _RunAtos(self, load_address, dsym_file, addresses):
    327     """Runs the atos with the provided arguments. |addresses| is used as stdin.
    328     Returns a list of symbol information in the same order as |addresses|."""
    329     args = ['atos', '-l', str(load_address), '-o', dsym_file]
    330 
    331     # Get the arch type. This is of the format |X86 (Native)|.
    332     if 'Code Type' in self.report_info:
    333       arch = self.report_info['Code Type'].lower().split(' ')
    334       if len(arch) == 2:
    335         arch = arch[0]
    336         if arch == 'x86':
    337           # The crash report refers to i386 as x86, but atos doesn't know what
    338           # that is.
    339           arch = 'i386'
    340         args.extend(['-arch', arch])
    341 
    342     proc = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    343     addresses = map(hex, addresses)
    344     (stdout, stderr) = proc.communicate(' '.join(addresses))
    345     if proc.returncode:
    346       return None
    347     return stdout.rstrip().split('\n')
    348 
    349   def _AddSymbolsToFrames(self, symbols, address_tuples):
    350     """Takes a single value (the list) from _CollectAddressesForImages and does
    351     a smart-zip with the data returned by atos in |symbols|. Note that the
    352     indices must match for this to succeed."""
    353     if len(symbols) != len(address_tuples):
    354       print 'symbols do not match'
    355 
    356     # Each line of output from atos is in this format:
    357     # |<symbol> (in <image>) (<file>:<line>)|.
    358     line_regex = re.compile('(.+) \(in (.+)\) (\((.+):([0-9]+)\))?')
    359 
    360     # Zip the two data sets together.
    361     for i in range(len(symbols)):
    362       symbol_parts = line_regex.match(symbols[i])
    363       if not symbol_parts:
    364         continue  # Error.
    365       frame = address_tuples[i][0]
    366       frame.symbol = symbol_parts.group(1)
    367       frame.image = symbol_parts.group(2)
    368       frame.file_name = symbol_parts.group(4)
    369       frame.line_number = symbol_parts.group(5)
    370 
    371 
    372 class CrashThread(object):
    373   """A CrashThread represents a stacktrace of a single thread """
    374   def __init__(self, thread_id):
    375     super(CrashThread, self).__init__()
    376     self.thread_id = thread_id
    377     self.name = None
    378     self.did_crash = False
    379     self.stack = []
    380 
    381   def __repr__(self):
    382     name = ''
    383     if self.name:
    384       name = ': ' + self.name
    385     return 'Thread ' + self.thread_id + name + '\n' + \
    386         '\n'.join(map(str, self.stack))
    387 
    388 
    389 class StackFrame(object):
    390   """A StackFrame is owned by a CrashThread."""
    391   def __init__(self, line):
    392     super(StackFrame, self).__init__()
    393     # The original line. This will be set to None if symbolication was
    394     # successfuly.
    395     self.line = line
    396 
    397     self.frame_id = 0
    398     self.image = None
    399     self.address = 0x0
    400     self.original_symbol = None
    401     self.offset = 0x0
    402     # The following members are set after symbolication.
    403     self.symbol = None
    404     self.file_name = None
    405     self.line_number = 0
    406 
    407   def __repr__(self):
    408     # If symbolication failed, just use the original line.
    409     if self.line:
    410       return '  %s' % self.line
    411 
    412     # Use different location information depending on symbolicated data.
    413     location = None
    414     if self.file_name:
    415       location = ' - %s:%s' % (self.file_name, self.line_number)
    416     else:
    417       location = ' + %s' % self.offset
    418 
    419     # Same with the symbol information.
    420     symbol = self.original_symbol
    421     if self.symbol:
    422       symbol = self.symbol
    423 
    424     return '  %s\t0x%x\t[%s\t%s]\t%s' % (self.frame_id, self.address,
    425         self.image, location, symbol)
    426 
    427 
    428 def PrettyPrintReport(report):
    429   """Takes a crash report and prints it like the crash server would."""
    430   print 'Process    : ' + report.report_info['Process']
    431   print 'Version    : ' + report.report_info['Version']
    432   print 'Date       : ' + report.report_info['Date/Time']
    433   print 'OS Version : ' + report.report_info['OS Version']
    434   print
    435   if 'Crashed Thread' in report.report_info:
    436     print 'Crashed Thread : ' + report.report_info['Crashed Thread']
    437     print
    438   if 'Event' in report.report_info:
    439     print 'Event      : ' + report.report_info['Event']
    440     print
    441 
    442   for thread in report.threads:
    443     print
    444     if thread.did_crash:
    445       exc_type = report.report_info['Exception Type'].split(' ')[0]
    446       exc_code = report.report_info['Exception Codes'].replace('at', '@')
    447       print '*CRASHED* ( ' + exc_type + ' / ' + exc_code + ' )'
    448     # Version 7 reports have spindump-style output (with a stepped stack trace),
    449     # so remove the first tab to get better alignment.
    450     if report.report_version == 7:
    451       for line in repr(thread).split('\n'):
    452         print line.replace('\t', '  ', 1)
    453     else:
    454       print thread
    455 
    456 
    457 def Main(args):
    458   """Program main."""
    459   parser = optparse.OptionParser(
    460       usage='%prog [options] symbol_path crash_report',
    461       description='This will parse and symbolicate an Apple CrashReporter v6-9 '
    462           'file.')
    463   parser.add_option('-s', '--std-path', action='store_true', dest='std_path',
    464                     help='With this flag, the symbol_path is a containing '
    465                     'directory, in which a dSYM files are stored in a '
    466                     'directory named by the version. Example: '
    467                     '[symbolicate_crash.py -s ./symbols/ report.crash] will '
    468                     'look for dSYMs in ./symbols/15.0.666.0/ if the report is '
    469                     'from that verison.')
    470   (options, args) = parser.parse_args(args[1:])
    471 
    472   # Check that we have something to symbolicate.
    473   if len(args) != 2:
    474     parser.print_usage()
    475     return 1
    476 
    477   report = CrashReport(args[1])
    478   symbol_path = None
    479 
    480   # If not using the standard layout, this is a full path to the symbols.
    481   if not options.std_path:
    482     symbol_path = args[0]
    483   # Otherwise, use the report version to locate symbols in a directory.
    484   else:
    485     # This is in the format of |M.N.B.P (B.P)|. Get just the part before the
    486     # space.
    487     chrome_version = report.report_info['Version'].split(' ')[0]
    488     symbol_path = os.path.join(args[0], chrome_version)
    489 
    490   # Check that the symbols exist.
    491   if not os.path.isdir(symbol_path):
    492     print >>sys.stderr, 'Symbol path %s is not a directory' % symbol_path
    493     return 2
    494 
    495   print >>sys.stderr, 'Using symbols from ' + symbol_path
    496   print >>sys.stderr, '=' * 80
    497 
    498   report.Symbolicate(symbol_path)
    499   PrettyPrintReport(report)
    500   return 0
    501 
    502 
# Run Main() only when this file is executed as a script; the value Main()
# returns becomes the process exit status.
if __name__ == '__main__':
  sys.exit(Main(sys.argv))
    505