Home | History | Annotate | Download | only in server
      1 # Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import logging
      6 import os
      7 import re
      8 import shutil
      9 from autotest_lib.client.common_lib import utils as client_utils
     10 from autotest_lib.client.common_lib.cros import dev_server
     11 from autotest_lib.client.common_lib.cros import retry
     12 from autotest_lib.client.cros import constants
     13 from autotest_lib.server.cros.dynamic_suite.constants import JOB_BUILD_KEY
     14 from autotest_lib.server.crashcollect import collect_log_file
     15 from autotest_lib.server import utils
     16 
     17 try:
     18     from chromite.lib import metrics
     19 except ImportError:
     20     metrics = client_utils.metrics_mock
     21 
     22 
     23 def generate_minidump_stacktrace(minidump_path):
     24     """
     25     Generates a stacktrace for the specified minidump.
     26 
     27     This function expects the debug symbols to reside under:
     28         /build/<board>/usr/lib/debug
     29 
     30     @param minidump_path: absolute path to minidump to by symbolicated.
     31     @raise client_utils.error.CmdError if minidump_stackwalk return code != 0.
     32     """
     33     symbol_dir = '%s/../../../lib/debug' % utils.get_server_dir()
     34     logging.info('symbol_dir: %s', symbol_dir)
     35     client_utils.run('minidump_stackwalk "%s" "%s" > "%s.txt"' %
     36                      (minidump_path, symbol_dir, minidump_path))
     37 
     38 
     39 def _resolve_crashserver():
     40     """
     41     Attempts to find a devserver / crashserver that has capacity to
     42     symbolicate a crashdump.
     43 
     44     @raises DevServerException if no server with capacity could be found.
     45     @returns Hostname of resolved server, if found.
     46     """
     47     crashserver_name = dev_server.get_least_loaded_devserver(
     48             devserver_type=dev_server.CrashServer)
     49     if not crashserver_name:
     50         metrics.Counter('chromeos/autotest/crashcollect/could_not_resolve'
     51                         ).increment()
     52         raise dev_server.DevServerException(
     53                 'No crash server has the capacity to symbolicate the dump.')
     54     else:
     55         metrics.Counter('chromeos/autotest/crashcollect/resolved'
     56                         ).increment(fields={'crash_server': crashserver_name})
     57     return crashserver_name
     58 
     59 
     60 def _symbolicate_minidump_with_devserver(minidump_path, resultdir,
     61                                         crashserver_name):
     62     """
     63     Generates a stack trace for the specified minidump by consulting devserver.
     64 
     65     This function assumes the debug symbols have been staged on the devserver.
     66 
     67     @param minidump_path: absolute path to minidump to by symbolicated.
     68     @param resultdir: server job's result directory.
     69     @param crashserver_name: Name of crashserver to attempt to symbolicate with.
     70     @raise DevServerException upon failure, HTTP or otherwise.
     71     """
     72     # First, look up what build we tested.  If we can't find this, we can't
     73     # get the right debug symbols, so we might as well give up right now.
     74     keyvals = client_utils.read_keyval(resultdir)
     75     if JOB_BUILD_KEY not in keyvals:
     76         raise dev_server.DevServerException(
     77             'Cannot determine build being tested.')
     78 
     79     devserver = dev_server.CrashServer(crashserver_name)
     80 
     81     with metrics.SecondsTimer(
     82             'chromeos/autotest/crashcollect/symbolicate_duration',
     83             fields={'crash_server': crashserver_name}):
     84         trace_text = devserver.symbolicate_dump(minidump_path,
     85                                                 keyvals[JOB_BUILD_KEY])
     86 
     87     if not trace_text:
     88         raise dev_server.DevServerException('Unknown error!!')
     89     with open(minidump_path + '.txt', 'w') as trace_file:
     90         trace_file.write(trace_text)
     91 
     92 def generate_stacktrace_for_file(minidump, host_resultdir):
     93     """
     94     Tries to generate a stack trace for the file located at |minidump|.
     95     @param minidump: path to minidump file to generate the stacktrace for.
     96     @param host_resultdir: server job's result directory.
     97     """
     98     # First, try to symbolicate locally.
     99     try:
    100         logging.info('Trying to generate stack trace locally for %s', minidump)
    101         generate_minidump_stacktrace(minidump)
    102         logging.info('Generated stack trace for dump %s', minidump)
    103         return
    104     except client_utils.error.CmdError as err:
    105         logging.info('Failed to generate stack trace locally for '
    106                      'dump %s (rc=%d):\n%r',
    107                      minidump, err.result_obj.exit_status, err)
    108 
    109     # If that did not succeed, try to symbolicate using the dev server.
    110     try:
    111         logging.info('Generating stack trace using devserver for %s', minidump)
    112         crashserver_name = _resolve_crashserver()
    113         args = (minidump, host_resultdir, crashserver_name)
    114         is_timeout, _ = retry.timeout(_symbolicate_minidump_with_devserver,
    115                                       args=args,
    116                                       timeout_sec=600)
    117         if is_timeout:
    118             logging.info('Generating stack trace timed out for dump %s',
    119                          minidump)
    120             metrics.Counter(
    121                     'chromeos/autotest/crashcollect/symbolicate_timed_out'
    122             ).increment(fields={'crash_server': crashserver_name})
    123         else:
    124             logging.info('Generated stack trace for dump %s', minidump)
    125             return
    126     except dev_server.DevServerException as e:
    127         logging.info('Failed to generate stack trace on devserver for dump '
    128                      '%s:\n%r', minidump, e)
    129 
    130     # Symbolicating failed.
    131     logging.warning('Failed to generate stack trace for %s (see info logs)',
    132                     minidump)
    133 
    134 def find_and_generate_minidump_stacktraces(host_resultdir):
    135     """
    136     Finds all minidump files and generates a stack trace for each.
    137 
    138     Enumerates all files under the test results directory (recursively)
    139     and generates a stack trace file for the minidumps.  Minidump files are
    140     identified as files with .dmp extension.  The stack trace filename is
    141     composed by appending the .txt extension to the minidump filename.
    142 
    143     @param host_resultdir: Directory to walk looking for dmp files.
    144 
    145     @returns The list of all found minidump files. Each dump may or may not have
    146              been symbolized.
    147     """
    148     minidumps = []
    149     for file in _find_crashdumps(host_resultdir):
    150         generate_stacktrace_for_file(file, host_resultdir)
    151         minidumps.append(file)
    152     return minidumps
    153 
    154 
    155 def _find_crashdumps(host_resultdir):
    156     """Find crashdumps.
    157 
    158     @param host_resultdir The result directory for this host for this test run.
    159     """
    160     for dir, subdirs, files in os.walk(host_resultdir):
    161         for file in files:
    162             if file.endswith('.dmp'):
    163                 yield os.path.join(dir, file)
    164 
    165 
    166 def _find_orphaned_crashdumps(host):
    167     """Return file paths of crashdumps on host.
    168 
    169     @param host A host object of the device.
    170     """
    171     return host.list_files_glob(os.path.join(constants.CRASH_DIR, '*'))
    172 
    173 
    174 def report_crashdumps(host):
    175     """Report on crashdumps for host.
    176 
    177     This is run when no tests failed.  We don't process crashdumps in this
    178     case because of devserver load, but they should still be reported.
    179 
    180     @param host A host object of the device we're to pull crashes from.
    181     """
    182     for crashfile in _find_orphaned_crashdumps(host):
    183         logging.warning('Host crashdump exists: %s', crashfile)
    184         host.job.record('INFO', None, None,
    185                         'Host crashdump exists: %s' % (crashfile,))
    186 
    187     host_resultdir = _get_host_resultdir(host)
    188     for crashfile in _find_crashdumps(host_resultdir):
    189         logging.warning('Local crashdump exists: %s', crashfile)
    190         host.job.record('INFO', None, None,
    191                         'Local crashdump exists: %s' % (crashfile,))
    192 
    193 
    194 def fetch_orphaned_crashdumps(host, infodir):
    195     """
    196     Copy all of the crashes in the crash directory over to the results folder.
    197 
    198     @param host A host object of the device we're to pull crashes from.
    199     @param infodir The directory to fetch crashdumps into.
    200     @return The list of minidumps that we pulled back from the host.
    201     """
    202     if not os.path.exists(infodir):
    203         os.mkdir(infodir)
    204     orphans = []
    205 
    206     if not host.check_cached_up_status():
    207         logging.warning('Host %s did not answer to ping, skip fetching '
    208                         'orphaned crashdumps.', host.hostname)
    209         return orphans
    210 
    211     try:
    212         for file in _find_orphaned_crashdumps(host):
    213             logging.info('Collecting %s...', file)
    214             collect_log_file(host, file, infodir, clean=True)
    215             orphans.append(file)
    216     except Exception as e:
    217         logging.warning('Collection of orphaned crash dumps failed %s', e)
    218     finally:
    219         # Delete infodir if we have no orphans
    220         if not orphans:
    221             logging.info('There are no orphaned crashes; deleting %s', infodir)
    222             os.rmdir(infodir)
    223     return orphans
    224 
    225 
    226 def _copy_to_debug_dir(host_resultdir, filename):
    227     """
    228     Copies a file to the debug dir under host_resultdir.
    229 
    230     @param host_resultdir The result directory for this host for this test run.
    231     @param filename The full path of the file to copy to the debug folder.
    232     """
    233     debugdir = os.path.join(host_resultdir, 'debug')
    234     src = filename
    235     dst = os.path.join(debugdir, os.path.basename(filename))
    236 
    237     try:
    238         shutil.copyfile(src, dst)
    239         logging.info('Copied %s to %s', src, dst)
    240     except IOError:
    241         logging.warning('Failed to copy %s to %s', src, dst)
    242 
    243 
    244 def _get_host_resultdir(host):
    245     """Get resultdir for host.
    246 
    247     @param host A host object of the device we're to pull crashes from.
    248     """
    249     return getattr(getattr(host, 'job', None), 'resultdir', None)
    250 
    251 
    252 def get_host_infodir(host):
    253     """Get infodir for host.
    254 
    255     @param host A host object of the device we're to pull crashes from.
    256     """
    257     host_resultdir = _get_host_resultdir(host)
    258     return os.path.join(host_resultdir, 'crashinfo.%s' % host.hostname)
    259 
    260 
    261 def get_site_crashdumps(host, test_start_time):
    262     """
    263     Copy all of the crashdumps from a host to the results directory.
    264 
    265     @param host The host object from which to pull crashes
    266     @param test_start_time When the test we just ran started.
    267     @return A list of all the minidumps
    268     """
    269     host_resultdir = _get_host_resultdir(host)
    270     infodir = get_host_infodir(host)
    271 
    272     orphans = fetch_orphaned_crashdumps(host, infodir)
    273     minidumps = find_and_generate_minidump_stacktraces(host_resultdir)
    274 
    275     # Record all crashdumps in status.log of the job:
    276     # - If one server job runs several client jobs we will only record
    277     # crashdumps in the status.log of the high level server job.
    278     # - We will record these crashdumps whether or not we successfully
    279     # symbolicate them.
    280     if host.job and minidumps or orphans:
    281         host.job.record('INFO', None, None, 'Start crashcollection record')
    282         for minidump in minidumps:
    283             host.job.record('INFO', None, 'New Crash Dump', minidump)
    284         for orphan in orphans:
    285             host.job.record('INFO', None, 'Orphaned Crash Dump', orphan)
    286         host.job.record('INFO', None, None, 'End crashcollection record')
    287 
    288     orphans.extend(minidumps)
    289 
    290     for minidump in orphans:
    291         report_bug_from_crash(host, minidump)
    292 
    293     # We copy Chrome crash information to the debug dir to assist debugging.
    294     # Since orphans occurred on a previous run, they are most likely not
    295     # relevant to the current failure, so we don't copy them.
    296     for minidump in minidumps:
    297         minidump_no_ext = os.path.splitext(minidump)[0]
    298         _copy_to_debug_dir(host_resultdir, minidump_no_ext + '.dmp.txt')
    299         _copy_to_debug_dir(host_resultdir, minidump_no_ext + '.log')
    300 
    301     return orphans
    302 
    303 
    304 def find_package_of(host, exec_name):
    305     """
    306     Find the package that an executable came from.
    307 
    308     @param host A host object that has the executable.
    309     @param exec_name Name of or path to executable.
    310     @return The name of the package that installed the executable.
    311     """
    312     # Run "portageq owners" on "host" to determine which package owns
    313     # "exec_name."  Portageq queue output consists of package names followed
    314     # tab-prefixed path names.  For example, owners of "python:"
    315     #
    316     # sys-devel/gdb-7.7.1-r2
    317     #         /usr/share/gdb/python
    318     # chromeos-base/dev-install-0.0.1-r711
    319     #         /usr/bin/python
    320     # dev-lang/python-2.7.3-r7
    321     #         /etc/env.d/python
    322     #
    323     # This gets piped into "xargs stat" to annotate each line with
    324     # information about the path, so we later can consider only packages
    325     # with executable files.  After annotation the above looks like:
    326     #
    327     # stat: cannot stat '@@@ sys-devel/gdb-7.7.1-r2 @@@': ...
    328     # stat: cannot stat '/usr/share/gdb/python': ...
    329     # stat: cannot stat '@@@ chromeos-base/dev-install-0.0.1-r711 @@@': ...
    330     # 755 -rwxr-xr-x /usr/bin/python
    331     # stat: cannot stat '@@@ dev-lang/python-2.7.3-r7 @@@': ...
    332     # 755 drwxr-xr-x /etc/env.d/python
    333     #
    334     # Package names are surrounded by "@@@" to facilitate parsing.  Lines
    335     # starting with an octal number were successfully annotated, because
    336     # the path existed on "host."
    337     # The above is then parsed to find packages which contain executable files
    338     # (not directories), in this case "chromeos-base/dev-install-0.0.1-r711."
    339     #
    340     # TODO(milleral): portageq can show scary looking error messages
    341     # in the debug logs via stderr. We only look at stdout, so those
    342     # get filtered, but it would be good to silence them.
    343     cmd = ('portageq owners / ' + exec_name +
    344             r'| sed -e "s/^[^\t].*/@@@ & @@@/" -e "s/^\t//"'
    345             r'| tr \\n \\0'
    346             ' | xargs -0 -r stat -L -c "%a %A %n" 2>&1')
    347     portageq = host.run(cmd, ignore_status=True)
    348 
    349     # Parse into a set of names of packages containing an executable file.
    350     packages = set()
    351     pkg = ''
    352     pkg_re = re.compile('@@@ (.*) @@@')
    353     path_re = re.compile('^([0-7]{3,}) (.)')
    354     for line in portageq.stdout.splitlines():
    355         match = pkg_re.search(line)
    356         if match:
    357             pkg = match.group(1)
    358             continue
    359         match = path_re.match(line)
    360         if match:
    361             isexec = int(match.group(1), 8) & 0o111
    362             isfile = match.group(2) == '-'
    363             if pkg and isexec and isfile:
    364                 packages.add(pkg)
    365 
    366     # If exactly one package found it must be the one we want, return it.
    367     if len(packages) == 1:
    368         return packages.pop()
    369 
    370     # TODO(milleral): Decide if it really is an error if not exactly one
    371     # package is found.
    372     # It is highly questionable as to if this should be left in the
    373     # production version of this code or not.
    374     if len(packages) == 0:
    375         logging.warning('find_package_of() found no packages for "%s"',
    376                         exec_name)
    377     else:
    378         logging.warning('find_package_of() found multiple packages for "%s": '
    379                         '%s', exec_name, ', '.join(packages))
    380     return ''
    381 
    382 
    383 def report_bug_from_crash(host, minidump_path):
    384     """
    385     Given a host to query and a minidump, file a bug about the crash.
    386 
    387     @param host A host object that is where the dump came from
    388     @param minidump_path The path to the dump file that should be reported.
    389     """
    390     # TODO(milleral): Once this has actually been tested, remove the
    391     # try/except. In the meantime, let's make sure nothing dies because of
    392     # the fact that this code isn't very heavily tested.
    393     try:
    394         meta_path = os.path.splitext(minidump_path)[0] + '.meta'
    395         with open(meta_path, 'r') as f:
    396             for line in f.readlines():
    397                 parts = line.split('=')
    398                 if parts[0] == 'exec_name':
    399                     package = find_package_of(host, parts[1].strip())
    400                     if not package:
    401                         package = '<unknown package>'
    402                     logging.info('Would report crash on %s.', package)
    403                     break
    404     except Exception as e:
    405         logging.warning('Crash detection failed with: %s', e)
    406