# Home | History | Annotate | Download | only in server
      1 import collections
      2 import logging
      3 import os
      4 import pipes
      5 import random
      6 import shutil
      7 import time
      8 
      9 import common
     10 from autotest_lib.client.bin.result_tools import runner as result_tools_runner
     11 from autotest_lib.client.common_lib import error
     12 from autotest_lib.client.common_lib import global_config
     13 from autotest_lib.client.cros import constants
     14 from autotest_lib.server import utils
     15 
try:
    from chromite.lib import metrics
except ImportError:
    # chromite is not always available (e.g. outside a ChromeOS checkout);
    # fall back to the mock from server utils so the SecondsTimerDecorator
    # and Counter usages below presumably become no-ops — confirm in utils.
    metrics = utils.metrics_mock
     20 
     21 
# The amortized max filesize to collect.  For example, if _MAX_FILESIZE is 10
# then we would collect a file with size 20 half the time, and a file with size
# 40 a quarter of the time, so that in the long run we are collecting files
# with this max size.
# _get_file_stats derives each file's collection_probability from this cap,
# and collect_log_file rolls against that probability before downloading.
_MAX_FILESIZE = 64 * (2 ** 20)  # 64 MiB
     27 
     28 class _RemoteTempDir(object):
     29 
     30     """Context manager for temporary directory on remote host."""
     31 
     32     def __init__(self, host):
     33         self.host = host
     34         self.tmpdir = None
     35 
     36     def __repr__(self):
     37         return '<{cls} host={this.host!r}, tmpdir={this.tmpdir!r}>'.format(
     38             cls=type(self).__name__, this=self)
     39 
     40     def __enter__(self):
     41         self.tmpdir = (self.host
     42                        .run('mktemp -d', stdout_tee=None)
     43                        .stdout.strip())
     44         return self.tmpdir
     45 
     46     def __exit__(self, exc_type, exc_value, exc_tb):
     47         self.host.run('rm -rf %s' % (pipes.quote(self.tmpdir),))
     48 
     49 
     50 def _collect_log_file_with_summary(host, source_path, dest_path):
     51     """Collects a log file from the remote machine with directory summary.
     52 
     53     @param host: The RemoteHost to collect logs from.
     54     @param source_path: The remote path to collect the log file from.
     55     @param dest_path: A path (file or directory) to write the copies logs into.
     56     """
     57     # Build test result directory summary
     58     summary_created = result_tools_runner.run_on_client(host, source_path)
     59 
     60     skip_summary_collection = True
     61     try:
     62         host.get_file(source_path, dest_path, preserve_perm=False)
     63         skip_summary_collection = False
     64     finally:
     65         if summary_created:
     66             # If dest_path is a file, use its parent folder to store the
     67             # directory summary file.
     68             if os.path.isfile(dest_path):
     69                 dest_path = os.path.dirname(dest_path)
     70             # If dest_path doesn't exist, that means get_file failed, there is
     71             # no need to collect directory summary file.
     72             skip_summary_collection |= not os.path.exists(dest_path)
     73             result_tools_runner.collect_last_summary(
     74                     host, source_path, dest_path,
     75                     skip_summary_collection=skip_summary_collection)
     76 
     77 
     78 def collect_log_file(host, log_path, dest_path, use_tmp=False, clean=False,
     79                      clean_content=False):
     80     """Collects a log file from the remote machine.
     81 
     82     Log files are collected from the remote machine and written into the
     83     destination path. If dest_path is a directory, the log file will be named
     84     using the basename of the remote log path.
     85 
     86     Very large files will randomly not be collected, to alleviate network
     87     traffic in the case of widespread crashes dumping large core files. Note
     88     that this check only applies to the exact file passed as log_path. For
     89     example, if this is a directory, the size of the contents will not be
     90     checked.
     91 
     92     @param host: The RemoteHost to collect logs from
     93     @param log_path: The remote path to collect the log file from
     94     @param dest_path: A path (file or directory) to write the copies logs into
     95     @param use_tmp: If True, will first copy the logs to a temporary directory
     96                     on the host and download logs from there.
     97     @param clean: If True, remove dest_path after upload attempt even if it
     98                   failed.
     99     @param clean_content: If True, remove files and directories in dest_path
    100             after upload attempt even if it failed.
    101 
    102     """
    103     logging.info('Collecting %s...', log_path)
    104     if not host.check_cached_up_status():
    105         logging.warning('Host %s did not answer to ping, skip collecting log '
    106                         'file %s.', host.hostname, log_path)
    107         return
    108     try:
    109         file_stats = _get_file_stats(host, log_path)
    110         if not file_stats:
    111             # Failed to get file stat, the file may not exist.
    112             return
    113 
    114         if (not result_tools_runner.ENABLE_RESULT_THROTTLING and
    115             random.random() > file_stats.collection_probability):
    116             logging.warning('Collection of %s skipped:'
    117                             'size=%s, collection_probability=%s',
    118                             log_path, file_stats.size,
    119                             file_stats.collection_probability)
    120         elif use_tmp:
    121             _collect_log_file_with_tmpdir(host, log_path, dest_path)
    122         else:
    123             _collect_log_file_with_summary(host, log_path, dest_path)
    124     except Exception as e:
    125         logging.exception('Non-critical failure: collection of %s failed: %s',
    126                           log_path, e)
    127     finally:
    128         if clean_content:
    129             path_to_delete = os.path.join(pipes.quote(log_path), '*')
    130         elif clean:
    131             path_to_delete = pipes.quote(log_path)
    132         if clean or clean_content:
    133             host.run('rm -rf %s' % path_to_delete, ignore_status=True)
    134 
    135 
# size: file size in bytes; collection_probability: chance in [0.0, 1.0]
# that the file is actually downloaded (see _get_file_stats and
# collect_log_file).
_FileStats = collections.namedtuple('_FileStats',
                                    'size collection_probability')
    138 
    139 
    140 def _collect_log_file_with_tmpdir(host, log_path, dest_path):
    141     """Collect log file from host through a temp directory on the host.
    142 
    143     @param host: The RemoteHost to collect logs from.
    144     @param log_path: The remote path to collect the log file from.
    145     @param dest_path: A path (file or directory) to write the copies logs into.
    146 
    147     """
    148     with _RemoteTempDir(host) as tmpdir:
    149         host.run('cp -rp %s %s' % (pipes.quote(log_path), pipes.quote(tmpdir)))
    150         source_path = os.path.join(tmpdir, os.path.basename(log_path))
    151 
    152         _collect_log_file_with_summary(host, source_path, dest_path)
    153 
    154 
    155 def _get_file_stats(host, path):
    156     """Get the stats of a file from host.
    157 
    158     @param host: Instance of Host subclass with run().
    159     @param path: Path of file to check.
    160     @returns: _FileStats namedtuple with file size and collection probability.
    161     """
    162     cmd = 'ls -ld %s | cut -d" " -f5' % (pipes.quote(path),)
    163     output = None
    164     file_size = 0
    165     try:
    166         output = host.run(cmd).stdout
    167     except error.CmdError as e:
    168         logging.warning('Getting size of file %r on host %r failed: %s. '
    169                         'Default its size to 0', path, host, e)
    170     try:
    171         if output is not None:
    172             file_size = int(output)
    173     except ValueError:
    174         logging.warning('Failed to convert size string "%s" for %s on host %r. '
    175                         'File may not exist.', output, path, host)
    176         return
    177 
    178     if file_size == 0:
    179         return _FileStats(0, 1.0)
    180     else:
    181         collection_probability = _MAX_FILESIZE / float(file_size)
    182         return _FileStats(file_size, collection_probability)
    183 
    184 
# import any site hooks for the crashdump and crashinfo collection
# NOTE(review): import_site_function presumably returns the named function
# from the site module when present, else the no-op lambda given as the
# last argument — confirm against server.utils.
get_site_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashdumps",
    lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashinfo",
    lambda host, test_start_time: None)
report_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "report_crashdumps",
    lambda host: None)
fetch_orphaned_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "fetch_orphaned_crashdumps",
    lambda host, host_resultdir: None)
get_host_infodir = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_host_infodir",
    lambda host: None)
    201 
    202 
@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashdumps_duration')
def get_crashdumps(host, test_start_time):
    """Collect crashdumps via the site hook, reporting the call's duration.

    @param host: The RemoteHost to collect crashdumps from.
    @param test_start_time: Passed through to the site hook unchanged.
    """
    get_site_crashdumps(host, test_start_time)
    207 
    208 
    209 @metrics.SecondsTimerDecorator(
    210         'chromeos/autotest/autoserv/get_crashinfo_duration')
    211 def get_crashinfo(host, test_start_time):
    212     logging.info("Collecting crash information...")
    213 
    214     # get_crashdumps collects orphaned crashdumps and symbolicates all
    215     # collected crashdumps. Symbolicating could happen
    216     # during a postjob task as well, at which time some crashdumps could have
    217     # already been pulled back from machine. So it doesn't necessarily need
    218     # to wait for the machine to come up.
    219     get_crashdumps(host, test_start_time)
    220 
    221     if wait_for_machine_to_recover(host):
    222         # run any site-specific collection
    223         get_site_crashinfo(host, test_start_time)
    224 
    225         crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
    226         collect_messages(host)
    227         collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
    228         collect_uncollected_logs(host)
    229 
    230         # Collect everything in /var/log.
    231         log_path = os.path.join(crashinfo_dir, 'var')
    232         os.makedirs(log_path)
    233         collect_log_file(host, constants.LOG_DIR, log_path)
    234 
    235         # Collect console-ramoops.  The filename has changed in linux-3.19,
    236         # so collect all the files in the pstore dirs.
    237         log_path = os.path.join(crashinfo_dir, 'pstore')
    238         for pstore_dir in constants.LOG_PSTORE_DIRS:
    239             collect_log_file(host, pstore_dir, log_path, use_tmp=True,
    240                              clean_content=True)
    241         # Collect i915_error_state, only available on intel systems.
    242         # i915 contains the Intel graphics state. It might contain useful data
    243         # when a DUT hangs, times out or crashes.
    244         log_path = os.path.join(
    245                 crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
    246         collect_log_file(host, constants.LOG_I915_ERROR_STATE,
    247                          log_path, use_tmp=True)
    248 
    249 
# Load default for number of hours to wait before giving up on crash collection.
# Used as the default timeout by wait_for_machine_to_recover below.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
    'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)
    253 
    254 
    255 def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    256     """Wait for a machine (possibly down) to become accessible again.
    257 
    258     @param host: A RemoteHost instance to wait on
    259     @param hours_to_wait: Number of hours to wait before giving up
    260 
    261     @returns: True if the machine comes back up, False otherwise
    262     """
    263     current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
    264     if host.is_up():
    265         logging.info("%s already up, collecting crash info", host.hostname)
    266         return True
    267 
    268     logging.info("Waiting %s hours for %s to come up (%s)",
    269                  hours_to_wait, host.hostname, current_time)
    270     if not host.wait_up(timeout=hours_to_wait * 3600):
    271         (metrics.Counter('chromeos/autotest/errors/collect_crashinfo_timeout')
    272          .increment())
    273         logging.warning("%s down, unable to collect crash info",
    274                         host.hostname)
    275         return False
    276     else:
    277         logging.info("%s is back up, collecting crash info", host.hostname)
    278         return True
    279 
    280 
    281 def get_crashinfo_dir(host, dir_prefix):
    282     """Find and if necessary create a directory to store crashinfo in.
    283 
    284     @param host: The RemoteHost object that crashinfo will be collected from
    285     @param dir_prefix: Prefix of directory name.
    286 
    287     @returns: The path to an existing directory for writing crashinfo into
    288     """
    289     host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
    290     if host_resultdir:
    291         infodir = host_resultdir
    292     else:
    293         infodir = os.path.abspath(os.getcwd())
    294     infodir = os.path.join(infodir, "%s.%s" % (dir_prefix, host.hostname))
    295     if not os.path.exists(infodir):
    296         os.mkdir(infodir)
    297     return infodir
    298 
    299 
    300 def collect_command(host, command, dest_path):
    301     """Collects the result of a command on the remote machine.
    302 
    303     The standard output of the command will be collected and written into the
    304     desitionation path. The destination path is assumed to be filename and
    305     not a directory.
    306 
    307     @param host: The RemoteHost to collect from
    308     @param command: A shell command to run on the remote machine and capture
    309         the output from.
    310     @param dest_path: A file path to write the results of the log into
    311     """
    312     logging.info("Collecting '%s' ...", command)
    313     try:
    314         result = host.run(command, stdout_tee=None).stdout
    315         utils.open_write_close(dest_path, result)
    316     except Exception, e:
    317         logging.warning("Collection of '%s' failed:\n%s", command, e)
    318 
    319 
    320 def collect_uncollected_logs(host):
    321     """Collects any leftover uncollected logs from the client.
    322 
    323     @param host: The RemoteHost to collect from
    324     """
    325     if host.job:
    326         try:
    327             logs = host.job.get_client_logs()
    328             for hostname, remote_path, local_path in logs:
    329                 if hostname == host.hostname:
    330                     logging.info('Retrieving logs from %s:%s into %s',
    331                                  hostname, remote_path, local_path)
    332                     collect_log_file(host, remote_path + '/', local_path + '/')
    333         except Exception, e:
    334             logging.warning('Error while trying to collect stranded '
    335                             'Autotest client logs: %s', e)
    336 
    337 
    338 def collect_messages(host):
    339     """Collects the 'new' contents of /var/log/messages.
    340 
    341     If host.VAR_LOG_MESSAGE_COPY_PATH is on the remote machine, collects
    342     the contents of /var/log/messages excluding whatever initial contents
    343     are already present in host.VAR_LOG_MESSAGE_COPY_PATH. If it is not
    344     present, simply collects the entire contents of /var/log/messages.
    345 
    346     @param host: The RemoteHost to collect from
    347     """
    348     crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
    349 
    350     try:
    351         # paths to the messages files
    352         messages = os.path.join(crashinfo_dir, "messages")
    353         messages_raw = os.path.join(crashinfo_dir, "messages.raw")
    354         messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")
    355 
    356         # grab the files from the remote host
    357         collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
    358                          messages_at_start)
    359         collect_log_file(host, "/var/log/messages", messages_raw)
    360 
    361         # figure out how much of messages.raw to skip
    362         if os.path.exists(messages_at_start):
    363             # if the first lines of the messages at start should match the
    364             # first lines of the current messages; if they don't then messages
    365             # has been erase or rotated and we just grab all of it
    366             first_line_at_start = utils.read_one_line(messages_at_start)
    367             first_line_now = utils.read_one_line(messages_raw)
    368             if first_line_at_start != first_line_now:
    369                 size_at_start = 0
    370             else:
    371                 size_at_start = os.path.getsize(messages_at_start)
    372         else:
    373             size_at_start = 0
    374         raw_messages_file = open(messages_raw)
    375         messages_file = open(messages, "w")
    376         raw_messages_file.seek(size_at_start)
    377         shutil.copyfileobj(raw_messages_file, messages_file)
    378         raw_messages_file.close()
    379         messages_file.close()
    380 
    381         # get rid of the "raw" versions of messages
    382         os.remove(messages_raw)
    383         if os.path.exists(messages_at_start):
    384             os.remove(messages_at_start)
    385     except Exception, e:
    386         logging.warning("Error while collecting /var/log/messages: %s", e)
    387