#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
This script can take an Apple-style CrashReporter log and symbolicate it. This
is useful for when a user's reports aren't being uploaded, for example.

Only versions 6, 7, 8, and 9 reports are supported. For more information on the
file format, reference this document:
  TN2123 <http://developer.apple.com/library/mac/#technotes/tn2004/tn2123.html>

Information on symbolication was gleaned from:
  <http://developer.apple.com/tools/xcode/symbolizingcrashdumps.html>
"""

import optparse
import os.path
import re
import subprocess
import sys

# Maps binary image identifiers to binary names (minus the .dSYM portion) found
# in the archive. These are the only objects that will be looked up.
SYMBOL_IMAGE_MAP = {
  'com.google.Chrome': 'Google Chrome.app',
  'com.google.Chrome.framework': 'Google Chrome Framework.framework',
  'com.google.Chrome.helper': 'Google Chrome Helper.app'
}


class CrashReport(object):
  """A parsed representation of an Apple CrashReport text file.

  Attributes:
    report_info: Dictionary of the key/value pairs found in the report header.
    report_version: Integer value of the header's 'Report Version' field.
    threads: List of CrashThread objects, in the order they were parsed.
  """

  def __init__(self, file_name):
    """Reads and parses the crash report stored at |file_name|.

    Raises:
      KeyError: If the header has no 'Report Version' entry.
      Exception: If the report version is not one of 6, 7, 8 or 9.
    """
    super(CrashReport, self).__init__()
    self.report_info = {}
    self.threads = []
    self._binary_images = {}

    # Use a context manager so that the file is closed even if one of the
    # parsers raises.
    with open(file_name, 'r') as fd:
      self._ParseHeader(fd)

      # Try and get the report version. If it's not a version we handle, abort.
      self.report_version = int(self.report_info['Report Version'])
      # Version 6: 10.5 and 10.6 crash report
      # Version 7: 10.6 spindump report
      # Version 8: 10.7 spindump report
      # Version 9: 10.7 crash report
      valid_versions = (6, 7, 8, 9)
      if self.report_version not in valid_versions:
        raise Exception("Only crash reports of versions %s are accepted." %
                        str(valid_versions))

      # If this is a spindump (version 7 or 8 report), use a special parser.
      # The format is undocumented, but is similar to version 6. However, the
      # spindump report contains user and kernel stacks for every process on
      # the system.
      if self.report_version in (7, 8):
        self._ParseSpindumpStack(fd)
      else:
        self._ParseStack(fd)

      self._ParseBinaryImages(fd)

  def Symbolicate(self, symbol_path):
    """Symbolicates a crash report stack trace.

    Args:
      symbol_path: Directory that contains the .dSYM bundles named in
          SYMBOL_IMAGE_MAP.
    """
    # In order to be efficient, collect all the offsets that will be passed to
    # atos by the image name.
    offsets_by_image = self._CollectAddressesForImages(SYMBOL_IMAGE_MAP.keys())

    # For each image, run atos with the list of addresses.
    for image_name, addresses in offsets_by_image.items():
      # If this image was not loaded or is in no stacks, skip.
      if image_name not in self._binary_images or not addresses:
        continue

      # Combine the |image_name| and |symbol_path| into the path of the dSYM.
      dsym_file = self._GetDSymPath(symbol_path, image_name)

      # From the list of 2-Tuples of (frame, address), create a list of just
      # addresses.
      address_list = [address for _, address in addresses]

      # Look up the load address of the image.
      binary_base = self._binary_images[image_name][0]

      # This returns a list of just symbols. The indices will match up with the
      # list of |addresses|.
      symbol_names = self._RunAtos(binary_base, dsym_file, address_list)
      if not symbol_names:
        print('Error loading symbols for ' + image_name)
        continue

      # Attaches a list of symbol names to stack frames. This assumes that the
      # order of |addresses| has stayed the same as |symbol_names|.
      self._AddSymbolsToFrames(symbol_names, addresses)

  def _ParseHeader(self, fd):
    """Parses the header section of a crash report, which contains the OS and
    application version information.

    On return, |fd| is positioned at the start of the first thread heading, or
    at the end of the file if no thread heading was found.
    """
    # The header is made up of different sections, depending on the type of
    # report and the report version. Almost all have a format of a key and
    # value separated by a colon. Accumulate all of these artifacts into a
    # dictionary until the first thread stack is reached.
    thread_re = re.compile(r'^[ \t]*Thread ([a-f0-9]+)')
    while True:
      # Remember where this line starts so that the thread heading can be
      # handed back to the stack parser. An absolute seek to a tell() result
      # also works on text-mode files, unlike a relative seek.
      line_start = fd.tell()
      line = fd.readline()
      # An empty string means end-of-file; stop rather than spin forever on a
      # report that has no thread section.
      if not line or thread_re.match(line):
        break

      # Skip blank lines. There are typically three or four sections separated
      # by newlines in the header.
      line = line.strip()
      if not line:
        continue

      # Certain lines in different report versions don't follow the key-value
      # format, so skip them.
      parts = line.split(':', 1)
      if len(parts) == 2:
        # There's a varying amount of space padding after the ':' to align all
        # the values; strip that.
        self.report_info[parts[0]] = parts[1].lstrip()

    # The first thread stack heading has been read past. Rewind to its start
    # so that it is re-read when the stack parser is entered.
    fd.seek(line_start)

  def _ParseStack(self, fd):
    """Parses the stack dump of a crash report and creates a list of threads
    and their stack traces. The threads are appended to |self.threads|."""
    # Compile a regex that matches the start of a thread stack. Note that this
    # must be specific to not include the thread state section, which comes
    # right after all the stack traces.
    line_re = re.compile(r'^Thread ([0-9]+)( Crashed)?:(.*)')

    # On entry into this function, the fd has been walked up to the "Thread 0"
    # line.
    line = fd.readline().rstrip()
    in_stack = False
    thread = None
    while line_re.match(line) or in_stack:
      # Check for start of the thread stack.
      matches = line_re.match(line)

      if not line.strip():
        # A blank line indicates a break in the thread stack.
        in_stack = False
      elif matches:
        # If this is the start of a thread stack, create the CrashThread.
        in_stack = True
        thread = CrashThread(matches.group(1))
        thread.name = matches.group(3)
        thread.did_crash = matches.group(2) is not None
        self.threads.append(thread)
      else:
        # All other lines are stack frames.
        thread.stack.append(self._ParseStackFrame(line))
      # Read the next line.
      line = fd.readline()

  def _ParseStackFrame(self, line):
    """Takes in a single line of text and transforms it into a StackFrame.

    If the line does not look like a stack frame, the returned StackFrame only
    carries the raw |line|.
    """
    frame = StackFrame(line)

    # A stack frame is in the format of:
    # |<frame-number> <binary-image> 0x<address> <symbol> <offset>|.
    regex = r'^([0-9]+) +(.+)[ \t]+(0x[0-9a-f]+) (.*) \+ ([0-9]+)$'
    matches = re.match(regex, line)
    if matches is None:
      return frame

    # Create a stack frame with the information extracted from the regex.
    frame.frame_id = matches.group(1)
    frame.image = matches.group(2)
    frame.address = int(matches.group(3), 0)  # Convert HEX to an int.
    frame.original_symbol = matches.group(4)
    frame.offset = matches.group(5)
    frame.line = None
    return frame

  def _ParseSpindumpStack(self, fd):
    """Parses a spindump stack report. In this format, each thread stack has
    both a user and kernel trace. Only the user traces are symbolicated."""

    # The stack trace begins with the thread header, which is identified by a
    # HEX number. The thread names appear to be incorrect in spindumps.
    user_thread_re = re.compile(r'^ Thread ([0-9a-fx]+)')

    # When this method is called, the fd has been walked right up to the first
    # line.
    line_start = fd.tell()
    line = fd.readline()
    in_user_stack = False
    in_kernel_stack = False
    thread = None
    # |line| is empty only at end-of-file. Without that check, a report that
    # ends while still inside a user stack would never leave the loop.
    while line and (user_thread_re.match(line) or in_user_stack or
                    in_kernel_stack):
      # Check for the start of a thread.
      matches = user_thread_re.match(line)

      if not line.strip():
        # A blank line indicates the start of a new thread. The blank line
        # comes after the kernel stack before a new thread header.
        in_kernel_stack = False
      elif matches:
        # This is the start of a thread header. The next line is the heading
        # for the user stack, followed by the actual trace.
        thread = CrashThread(matches.group(1))
        self.threads.append(thread)
        in_user_stack = True
        fd.readline()  # Read past the 'User stack:' header.
      elif line.startswith(' Kernel stack:'):
        # The kernel stack header comes immediately after the last frame
        # (really the top frame) in the user stack, without a blank line.
        in_user_stack = False
        in_kernel_stack = True
      elif in_user_stack:
        # If this is a line while in the user stack, parse it as a stack frame.
        thread.stack.append(self._ParseSpindumpStackFrame(line))

      # Loop with the next line, remembering where it starts.
      line_start = fd.tell()
      line = fd.readline()

    # When the loop exits, one line past the stacks has been consumed. Rewind
    # to the start of that line so that _ParseBinaryImages() sees it.
    fd.seek(line_start)

  def _ParseSpindumpStackFrame(self, line):
    """Parses a spindump-style stackframe and returns it as a StackFrame."""
    frame = StackFrame(line)

    # The format of the frame is either:
    # A: |<space><steps> <symbol> + <offset> (in <image-name>) [<address>]|
    # B: |<space><steps> ??? (in <image-name> + <offset>) [<address>]|
    regex_a = r'^([ ]+[0-9]+) (.*) \+ ([0-9]+) \(in (.*)\) \[(0x[0-9a-f]+)\]'
    regex_b = (r'^([ ]+[0-9]+) \?\?\?( \(in (.*) \+ ([0-9]+)\))? '
               r'\[(0x[0-9a-f]+)\]')

    # Create the stack frame with the information extracted from the regex.
    matches = re.match(regex_a, line)
    if matches:
      frame.frame_id = matches.group(1)[4:]  # Remove some leading spaces.
      frame.original_symbol = matches.group(2)
      frame.offset = matches.group(3)
      frame.image = matches.group(4)
      frame.address = int(matches.group(5), 0)
      frame.line = None
      return frame

    # If pattern A didn't match (which it will most of the time), try B.
    matches = re.match(regex_b, line)
    if matches:
      frame.frame_id = matches.group(1)[4:]  # Remove some leading spaces.
      frame.image = matches.group(3)
      frame.offset = matches.group(4)
      frame.address = int(matches.group(5), 0)
      frame.line = None
      return frame

    # Otherwise, this frame could not be matched and just use the raw input.
    frame.line = frame.line.strip()
    return frame

  def _ParseBinaryImages(self, fd):
    """Parses out the binary images section in order to get the load offset.

    The address range of every recognized image is stored in
    |self._binary_images|, keyed by image name. If the report has no "Binary
    Images:" section, nothing is stored.
    """
    # The parser skips some sections, so advance until the "Binary Images"
    # header is reached.
    while True:
      line = fd.readline()
      if not line:
        # End-of-file without a binary images section; there is nothing to
        # parse, and reading on would never terminate.
        return
      if line.lstrip().startswith("Binary Images:"):
        break

    # Create a regex to match the lines of format:
    # |0x<start> - 0x<end> <binary-image> <version> (<version>) <<UUID>> <path>|
    image_re = re.compile(
        r'[ ]*(0x[0-9a-f]+) -[ \t]+(0x[0-9a-f]+) [+ ]([a-zA-Z0-9._\-]+)')

    # This section is in this format:
    # |<start address> - <end address> <image name>|.
    while True:
      line = fd.readline()
      if not line.strip():
        # End when a blank line (or the end of the file) is hit.
        return
      # Match the line to the regex.
      match = image_re.match(line)
      if match:
        # Store the offsets by image name so it can be referenced during
        # symbolication. These are hex numbers with leading '0x', so int() can
        # convert them to decimal if base=0.
        address_range = (int(match.group(1), 0), int(match.group(2), 0))
        self._binary_images[match.group(3)] = address_range

  def _CollectAddressesForImages(self, images):
    """Iterates all the threads and stack frames and collects the stack frames
    that are in a list of binary |images|. The result is a dictionary, keyed by
    the image name that maps to a list of tuples. Each is a 2-Tuple of
    (stack_frame, address)."""
    # Create the collection and initialize it with empty lists for each image.
    collection = dict((image, []) for image in images)

    # Perform the iteration.
    for thread in self.threads:
      for frame in thread.stack:
        image_name = self._ImageForAddress(frame.address)
        # The keys of |collection| are exactly the requested images.
        if image_name in collection:
          # Replace the image name in the frame in case it was elided.
          frame.image = image_name
          collection[image_name].append((frame, frame.address))

    # Return the result.
    return collection

  def _ImageForAddress(self, address):
    """Given a PC address, returns the bundle identifier of the image in which
    the address resides, or None if no parsed image contains it."""
    for image_name, address_range in self._binary_images.items():
      if address_range[0] <= address <= address_range[1]:
        return image_name
    return None

  def _GetDSymPath(self, base_path, image_name):
    """Takes a base path for the symbols and an image name. It looks the name up
    in SYMBOL_IMAGE_MAP and creates a full path to the dSYM in the bundle."""
    image_file = SYMBOL_IMAGE_MAP[image_name]
    return os.path.join(base_path, image_file + '.dSYM', 'Contents',
                        'Resources', 'DWARF',
                        os.path.splitext(image_file)[0])  # Chop off the ext.

  def _RunAtos(self, load_address, dsym_file, addresses):
    """Runs the atos with the provided arguments. |addresses| is used as stdin.

    Args:
      load_address: Integer load address of the binary image.
      dsym_file: Path of the DWARF file inside the dSYM bundle.
      addresses: List of integer addresses to look up.

    Returns:
      A list of symbol information in the same order as |addresses|, or None
      if atos exited with a non-zero status.
    """
    args = ['atos', '-l', str(load_address), '-o', dsym_file]

    # Get the arch type. This is of the format |X86 (Native)|. Only the first
    # word is used, so a value with an unexpected number of words can never
    # end up in |args| as a list.
    code_type = self.report_info.get('Code Type')
    if code_type:
      arch = code_type.lower().split(' ')[0]
      # The crash report refers to i386 as x86 and to x86_64 as x86-64, but
      # atos doesn't know what those are.
      arch = {'x86': 'i386', 'x86-64': 'x86_64'}.get(arch, arch)
      if arch:
        args.extend(['-arch', arch])

    # universal_newlines makes the pipes text-mode, so that str can be written
    # and read regardless of the Python version.
    proc = subprocess.Popen(args, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE, universal_newlines=True)
    # Format by hand rather than with hex(), which appends an 'L' suffix to
    # long integers on Python 2.
    stdin_data = ' '.join('0x%x' % address for address in addresses)
    (stdout, _) = proc.communicate(stdin_data)
    if proc.returncode:
      return None
    return stdout.rstrip().split('\n')

  def _AddSymbolsToFrames(self, symbols, address_tuples):
    """Takes a single value (the list) from _CollectAddressesForImages and does
    a smart-zip with the data returned by atos in |symbols|. Note that the
    indices must match for this to succeed. If the two lists differ in length,
    a warning is printed and only the common prefix is processed."""
    if len(symbols) != len(address_tuples):
      print('symbols do not match')

    # Each line of output from atos is in this format:
    # |<symbol> (in <image>) (<file>:<line>)|.
    line_regex = re.compile(r'(.+) \(in (.+)\) (\((.+):([0-9]+)\))?')

    # Zip the two data sets together. zip() stops at the shorter list, so
    # surplus atos output cannot index past the end of |address_tuples|.
    for symbol_line, (frame, _) in zip(symbols, address_tuples):
      symbol_parts = line_regex.match(symbol_line)
      if not symbol_parts:
        continue  # Error.
      frame.symbol = symbol_parts.group(1)
      frame.image = symbol_parts.group(2)
      frame.file_name = symbol_parts.group(4)
      frame.line_number = symbol_parts.group(5)


class CrashThread(object):
  """A CrashThread represents a stacktrace of a single thread."""

  def __init__(self, thread_id):
    super(CrashThread, self).__init__()
    # The identifier exactly as it appeared in the report (a string).
    self.thread_id = thread_id
    self.name = None
    self.did_crash = False
    # List of StackFrame objects.
    self.stack = []

  def __repr__(self):
    name = ''
    if self.name:
      name = ': ' + self.name
    return 'Thread ' + self.thread_id + name + '\n' + \
        '\n'.join(map(str, self.stack))


class StackFrame(object):
  """A StackFrame is owned by a CrashThread."""

  def __init__(self, line):
    super(StackFrame, self).__init__()
    # The original line. This will be set to None if the line was parsed
    # successfully.
    self.line = line

    self.frame_id = 0
    self.image = None
    self.address = 0x0
    self.original_symbol = None
    self.offset = 0x0
    # The following members are set after symbolication.
    self.symbol = None
    self.file_name = None
    self.line_number = 0

  def __repr__(self):
    # If parsing failed, just use the original line.
    if self.line:
      return ' %s' % self.line

    # Use different location information depending on symbolicated data.
    if self.file_name:
      location = ' - %s:%s' % (self.file_name, self.line_number)
    else:
      location = ' + %s' % self.offset

    # Same with the symbol information.
    symbol = self.original_symbol
    if self.symbol:
      symbol = self.symbol

    return ' %s\t0x%x\t[%s\t%s]\t%s' % (self.frame_id, self.address,
                                        self.image, location, symbol)


def PrettyPrintReport(report):
  """Takes a crash report and prints it like the crash server would."""
  print('Process : ' + report.report_info['Process'])
  print('Version : ' + report.report_info['Version'])
  print('Date : ' + report.report_info['Date/Time'])
  print('OS Version : ' + report.report_info['OS Version'])
  print('')
  if 'Crashed Thread' in report.report_info:
    print('Crashed Thread : ' + report.report_info['Crashed Thread'])
    print('')
  if 'Event' in report.report_info:
    print('Event : ' + report.report_info['Event'])
    print('')

  for thread in report.threads:
    print('')
    if thread.did_crash:
      exc_type = report.report_info['Exception Type'].split(' ')[0]
      exc_code = report.report_info['Exception Codes'].replace('at', '@')
      print('*CRASHED* ( ' + exc_type + ' / ' + exc_code + ' )')
    # Version 7 and 8 reports have spindump-style output (with a stepped stack
    # trace), so remove the first tab to get better alignment.
    if report.report_version in (7, 8):
      for line in repr(thread).split('\n'):
        print(line.replace('\t', ' ', 1))
    else:
      print(thread)


def Main(args):
  """Program main.

  Args:
    args: The full argument vector, including the program name.

  Returns:
    The process exit code: 0 on success, 1 on a usage error and 2 if the
    symbol path is not a directory.
  """
  parser = optparse.OptionParser(
      usage='%prog [options] symbol_path crash_report',
      description='This will parse and symbolicate an Apple CrashReporter v6-9 '
                  'file.')
  parser.add_option('-s', '--std-path', action='store_true', dest='std_path',
                    help='With this flag, the symbol_path is a containing '
                         'directory, in which a dSYM files are stored in a '
                         'directory named by the version. Example: '
                         '[symbolicate_crash.py -s ./symbols/ report.crash] '
                         'will '
                         'look for dSYMs in ./symbols/15.0.666.0/ if the '
                         'report is '
                         'from that verison.')
  (options, args) = parser.parse_args(args[1:])

  # Check that we have something to symbolicate.
  if len(args) != 2:
    parser.print_usage()
    return 1

  report = CrashReport(args[1])
  symbol_path = None

  # If not using the standard layout, this is a full path to the symbols.
  if not options.std_path:
    symbol_path = args[0]
  # Otherwise, use the report version to locate symbols in a directory.
  else:
    # This is in the format of |M.N.B.P (B.P)|. Get just the part before the
    # space.
    chrome_version = report.report_info['Version'].split(' ')[0]
    symbol_path = os.path.join(args[0], chrome_version)

  # Check that the symbols exist.
  if not os.path.isdir(symbol_path):
    sys.stderr.write('Symbol path %s is not a directory' % symbol_path + '\n')
    return 2

  sys.stderr.write('Using symbols from ' + symbol_path + '\n')
  sys.stderr.write('=' * 80 + '\n')

  report.Symbolicate(symbol_path)
  PrettyPrintReport(report)
  return 0


if __name__ == '__main__':
  sys.exit(Main(sys.argv))