# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import os
import re
import shutil
from autotest_lib.client.common_lib import utils as client_utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.client.cros import constants
from autotest_lib.server.cros.dynamic_suite.constants import JOB_BUILD_KEY
from autotest_lib.server.crashcollect import collect_log_file
from autotest_lib.server import utils

try:
    from chromite.lib import metrics
except ImportError:
    metrics = client_utils.metrics_mock

# shlex.quote only exists on Python 3.3+; pipes.quote is the Python 2
# equivalent.
try:
    from shlex import quote as _shell_quote
except ImportError:
    from pipes import quote as _shell_quote


def generate_minidump_stacktrace(minidump_path):
    """
    Generates a stacktrace for the specified minidump.

    This function expects the debug symbols to reside under:
        /build/<board>/usr/lib/debug

    The output of minidump_stackwalk is redirected to a file named
    |minidump_path| + '.txt'.

    @param minidump_path: absolute path to minidump to be symbolicated.
    @raise client_utils.error.CmdError if minidump_stackwalk return code != 0.
    """
    symbol_dir = '%s/../../../lib/debug' % utils.get_server_dir()
    logging.info('symbol_dir: %s', symbol_dir)
    client_utils.run('minidump_stackwalk "%s" "%s" > "%s.txt"' %
                     (minidump_path, symbol_dir, minidump_path))


def _resolve_crashserver():
    """
    Attempts to find a devserver / crashserver that has capacity to
    symbolicate a crashdump.

    A metrics counter is incremented for both the resolved and the
    could-not-resolve outcome.

    @raises DevServerException if no server with capacity could be found.
    @returns Hostname of resolved server, if found.
    """
    crashserver_name = dev_server.get_least_loaded_devserver(
            devserver_type=dev_server.CrashServer)
    if not crashserver_name:
        metrics.Counter('chromeos/autotest/crashcollect/could_not_resolve'
                        ).increment()
        raise dev_server.DevServerException(
                'No crash server has the capacity to symbolicate the dump.')
    else:
        metrics.Counter('chromeos/autotest/crashcollect/resolved'
                        ).increment(fields={'crash_server': crashserver_name})
    return crashserver_name


def _symbolicate_minidump_with_devserver(minidump_path, resultdir,
                                         crashserver_name):
    """
    Generates a stack trace for the specified minidump by consulting devserver.

    This function assumes the debug symbols have been staged on the devserver.
    On success the trace text is written to |minidump_path| + '.txt'.

    @param minidump_path: absolute path to minidump to be symbolicated.
    @param resultdir: server job's result directory.
    @param crashserver_name: Name of crashserver to attempt to symbolicate with.
    @raise DevServerException upon failure, HTTP or otherwise.
    """
    # First, look up what build we tested.  If we can't find this, we can't
    # get the right debug symbols, so we might as well give up right now.
    keyvals = client_utils.read_keyval(resultdir)
    if JOB_BUILD_KEY not in keyvals:
        raise dev_server.DevServerException(
                'Cannot determine build being tested.')

    devserver = dev_server.CrashServer(crashserver_name)

    with metrics.SecondsTimer(
            'chromeos/autotest/crashcollect/symbolicate_duration',
            fields={'crash_server': crashserver_name}):
        trace_text = devserver.symbolicate_dump(minidump_path,
                                                keyvals[JOB_BUILD_KEY])

    if not trace_text:
        raise dev_server.DevServerException('Unknown error!!')
    with open(minidump_path + '.txt', 'w') as trace_file:
        trace_file.write(trace_text)


def generate_stacktrace_for_file(minidump, host_resultdir):
    """
    Tries to generate a stack trace for the file located at |minidump|.

    Local symbolication is attempted first; if that fails with a CmdError
    the devserver is tried. Failures are logged, not raised.

    @param minidump: path to minidump file to generate the stacktrace for.
    @param host_resultdir: server job's result directory.
    """
    # First, try to symbolicate locally.
    try:
        logging.info('Trying to generate stack trace locally for %s', minidump)
        generate_minidump_stacktrace(minidump)
        logging.info('Generated stack trace for dump %s', minidump)
        return
    except client_utils.error.CmdError as err:
        logging.info('Failed to generate stack trace locally for '
                     'dump %s (rc=%d):\n%r',
                     minidump, err.result_obj.exit_status, err)

    # If that did not succeed, try to symbolicate using the dev server.
    try:
        logging.info('Generating stack trace using devserver for %s', minidump)
        crashserver_name = _resolve_crashserver()
        args = (minidump, host_resultdir, crashserver_name)
        is_timeout, _ = retry.timeout(_symbolicate_minidump_with_devserver,
                                      args=args,
                                      timeout_sec=600)
        if is_timeout:
            logging.info('Generating stack trace timed out for dump %s',
                         minidump)
            metrics.Counter(
                    'chromeos/autotest/crashcollect/symbolicate_timed_out'
                    ).increment(fields={'crash_server': crashserver_name})
        else:
            logging.info('Generated stack trace for dump %s', minidump)
            return
    except dev_server.DevServerException as e:
        logging.info('Failed to generate stack trace on devserver for dump '
                     '%s:\n%r', minidump, e)

    # Symbolicating failed.
    logging.warning('Failed to generate stack trace for %s (see info logs)',
                    minidump)


def find_and_generate_minidump_stacktraces(host_resultdir):
    """
    Finds all minidump files and generates a stack trace for each.

    Enumerates all files under the test results directory (recursively)
    and generates a stack trace file for the minidumps. Minidump files are
    identified as files with .dmp extension. The stack trace filename is
    composed by appending the .txt extension to the minidump filename.

    @param host_resultdir: Directory to walk looking for dmp files.

    @returns The list of all found minidump files. Each dump may or may not have
             been symbolized.
    """
    minidumps = []
    for dump_path in _find_crashdumps(host_resultdir):
        generate_stacktrace_for_file(dump_path, host_resultdir)
        minidumps.append(dump_path)
    return minidumps


def _find_crashdumps(host_resultdir):
    """Find crashdumps.

    Walks |host_resultdir| recursively and yields the path of every file
    whose name ends in '.dmp'.

    @param host_resultdir The result directory for this host for this test run.
    """
    for dirpath, _, filenames in os.walk(host_resultdir):
        for filename in filenames:
            if filename.endswith('.dmp'):
                yield os.path.join(dirpath, filename)


def _find_orphaned_crashdumps(host):
    """Return file paths of crashdumps on host.

    @param host A host object of the device.
    """
    return host.list_files_glob(os.path.join(constants.CRASH_DIR, '*'))


def report_crashdumps(host):
    """Report on crashdumps for host.

    This is run when no tests failed.  We don't process crashdumps in this
    case because of devserver load, but they should still be reported.

    Each crashdump found on the host and in the local result directory is
    logged as a warning and recorded as an INFO entry on host.job.

    @param host A host object of the device we're to pull crashes from.
    """
    for crashfile in _find_orphaned_crashdumps(host):
        logging.warning('Host crashdump exists: %s', crashfile)
        host.job.record('INFO', None, None,
                        'Host crashdump exists: %s' % (crashfile,))

    host_resultdir = _get_host_resultdir(host)
    for crashfile in _find_crashdumps(host_resultdir):
        logging.warning('Local crashdump exists: %s', crashfile)
        host.job.record('INFO', None, None,
                        'Local crashdump exists: %s' % (crashfile,))


def fetch_orphaned_crashdumps(host, infodir):
    """
    Copy all of the crashes in the crash directory over to the results folder.

    Collection is best-effort: any failure while collecting is logged and the
    crashes gathered so far are returned.

    @param host A host object of the device we're to pull crashes from.
    @param infodir The directory to fetch crashdumps into.
    @return The list of minidumps that we pulled back from the host.
    """
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    orphans = []

    if not host.check_cached_up_status():
        logging.warning('Host %s did not answer to ping, skip fetching '
                        'orphaned crashdumps.', host.hostname)
        return orphans

    try:
        for crashfile in _find_orphaned_crashdumps(host):
            logging.info('Collecting %s...', crashfile)
            collect_log_file(host, crashfile, infodir, clean=True)
            orphans.append(crashfile)
    except Exception as e:
        logging.warning('Collection of orphaned crash dumps failed %s', e)
    finally:
        # Delete infodir if we have no orphans
        if not orphans:
            logging.info('There are no orphaned crashes; deleting %s', infodir)
            try:
                os.rmdir(infodir)
            except OSError as e:
                # rmdir fails if the directory is not empty (e.g. a partial
                # collection left files behind).  Cleanup is best-effort, so
                # log instead of letting it escape or mask another exception.
                logging.warning('Could not delete %s: %s', infodir, e)
    return orphans


def _copy_to_debug_dir(host_resultdir, filename):
    """
    Copies a file to the debug dir under host_resultdir.

    A failed copy (IOError) is logged as a warning and otherwise ignored.

    @param host_resultdir The result directory for this host for this test run.
    @param filename The full path of the file to copy to the debug folder.
    """
    debugdir = os.path.join(host_resultdir, 'debug')
    src = filename
    dst = os.path.join(debugdir, os.path.basename(filename))

    try:
        shutil.copyfile(src, dst)
        logging.info('Copied %s to %s', src, dst)
    except IOError:
        logging.warning('Failed to copy %s to %s', src, dst)


def _get_host_resultdir(host):
    """Get resultdir for host.

    @param host A host object of the device we're to pull crashes from.
    @return host.job.resultdir, or None if host has no job or the job has
            no resultdir attribute.
    """
    return getattr(getattr(host, 'job', None), 'resultdir', None)


def get_host_infodir(host):
    """Get infodir for host.

    @param host A host object of the device we're to pull crashes from.
    @return Path 'crashinfo.<hostname>' under the host's result directory.
    """
    host_resultdir = _get_host_resultdir(host)
    return os.path.join(host_resultdir, 'crashinfo.%s' % host.hostname)


def get_site_crashdumps(host, test_start_time):
    """
    Copy all of the crashdumps from a host to the results directory.

    @param host The host object from which to pull crashes
    @param test_start_time When the test we just ran started.  Not used by
                           this function.
    @return A list of all the minidumps
    """
    host_resultdir = _get_host_resultdir(host)
    infodir = get_host_infodir(host)

    orphans = fetch_orphaned_crashdumps(host, infodir)
    minidumps = find_and_generate_minidump_stacktraces(host_resultdir)

    # Record all crashdumps in status.log of the job:
    # - If one server job runs several client jobs we will only record
    # crashdumps in the status.log of the high level server job.
    # - We will record these crashdumps whether or not we successfully
    # symbolicate them.
    # The parentheses matter: 'and' binds tighter than 'or', so without them
    # a host with no job but some orphans would reach host.job.record().
    if host.job and (minidumps or orphans):
        host.job.record('INFO', None, None, 'Start crashcollection record')
        for minidump in minidumps:
            host.job.record('INFO', None, 'New Crash Dump', minidump)
        for orphan in orphans:
            host.job.record('INFO', None, 'Orphaned Crash Dump', orphan)
        host.job.record('INFO', None, None, 'End crashcollection record')

    orphans.extend(minidumps)

    for minidump in orphans:
        report_bug_from_crash(host, minidump)

    # We copy Chrome crash information to the debug dir to assist debugging.
    # Since orphans occurred on a previous run, they are most likely not
    # relevant to the current failure, so we don't copy them.
    for minidump in minidumps:
        minidump_no_ext = os.path.splitext(minidump)[0]
        _copy_to_debug_dir(host_resultdir, minidump_no_ext + '.dmp.txt')
        _copy_to_debug_dir(host_resultdir, minidump_no_ext + '.log')

    return orphans


def find_package_of(host, exec_name):
    """
    Find the package that an executable came from.

    @param host A host object that has the executable.
    @param exec_name Name of or path to executable.
    @return The name of the package that installed the executable, or ''
            if zero or more than one candidate package was found.
    """
    # Run "portageq owners" on "host" to determine which package owns
    # "exec_name."  Portageq output consists of package names followed by
    # tab-prefixed path names.  For example, owners of "python:"
    #
    # sys-devel/gdb-7.7.1-r2
    #         /usr/share/gdb/python
    # chromeos-base/dev-install-0.0.1-r711
    #         /usr/bin/python
    # dev-lang/python-2.7.3-r7
    #         /etc/env.d/python
    #
    # This gets piped into "xargs stat" to annotate each line with
    # information about the path, so we later can consider only packages
    # with executable files.  After annotation the above looks like:
    #
    # stat: cannot stat '@@@ sys-devel/gdb-7.7.1-r2 @@@': ...
    # stat: cannot stat '/usr/share/gdb/python': ...
    # stat: cannot stat '@@@ chromeos-base/dev-install-0.0.1-r711 @@@': ...
    # 755 -rwxr-xr-x /usr/bin/python
    # stat: cannot stat '@@@ dev-lang/python-2.7.3-r7 @@@': ...
    # 755 drwxr-xr-x /etc/env.d/python
    #
    # Package names are surrounded by "@@@" to facilitate parsing.  Lines
    # starting with an octal number were successfully annotated, because
    # the path existed on "host."
    # The above is then parsed to find packages which contain executable files
    # (not directories), in this case "chromeos-base/dev-install-0.0.1-r711."
    #
    # TODO(milleral): portageq can show scary looking error messages
    # in the debug logs via stderr. We only look at stdout, so those
    # get filtered, but it would be good to silence them.
    #
    # exec_name is shell-quoted because it is read from a crash .meta file
    # and ends up in a shell pipeline; ordinary names are left unchanged by
    # quoting.
    cmd = ('portageq owners / ' + _shell_quote(exec_name) +
           r'| sed -e "s/^[^\t].*/@@@ & @@@/" -e "s/^\t//"'
           r'| tr \\n \\0'
           ' | xargs -0 -r stat -L -c "%a %A %n" 2>&1')
    portageq = host.run(cmd, ignore_status=True)

    # Parse into a set of names of packages containing an executable file.
    packages = set()
    pkg = ''
    pkg_re = re.compile('@@@ (.*) @@@')
    path_re = re.compile('^([0-7]{3,}) (.)')
    for line in portageq.stdout.splitlines():
        match = pkg_re.search(line)
        if match:
            # A package header: remember it for the path lines that follow.
            pkg = match.group(1)
            continue
        match = path_re.match(line)
        if match:
            # Any of the user/group/other execute bits set.
            isexec = int(match.group(1), 8) & 0o111
            # First character of the "%A" field is '-' for regular files.
            isfile = match.group(2) == '-'
            if pkg and isexec and isfile:
                packages.add(pkg)

    # If exactly one package found it must be the one we want, return it.
    if len(packages) == 1:
        return packages.pop()

    # TODO(milleral): Decide if it really is an error if not exactly one
    # package is found.
    # It is highly questionable as to if this should be left in the
    # production version of this code or not.
    if not packages:
        logging.warning('find_package_of() found no packages for "%s"',
                        exec_name)
    else:
        logging.warning('find_package_of() found multiple packages for "%s": '
                        '%s', exec_name, ', '.join(packages))
    return ''


def report_bug_from_crash(host, minidump_path):
    """
    Given a host to query and a minidump, file a bug about the crash.

    Currently this only logs which package would be reported; the package
    is looked up from the 'exec_name' entry of the '.meta' file that sits
    next to the minidump.  Any failure is logged and swallowed.

    @param host A host object that is where the dump came from
    @param minidump_path The path to the dump file that should be reported.
    """
    # TODO(milleral): Once this has actually been tested, remove the
    # try/except. In the meantime, let's make sure nothing dies because of
    # the fact that this code isn't very heavily tested.
    try:
        meta_path = os.path.splitext(minidump_path)[0] + '.meta'
        with open(meta_path, 'r') as f:
            for line in f:
                # Split on the first '=' only so a value that itself
                # contains '=' is not truncated.
                parts = line.split('=', 1)
                if parts[0] == 'exec_name':
                    package = find_package_of(host, parts[1].strip())
                    if not package:
                        package = '<unknown package>'
                    logging.info('Would report crash on %s.', package)
                    break
    except Exception as e:
        logging.warning('Crash detection failed with: %s', e)