import os, time, logging, shutil

from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.client.cros import constants
from autotest_lib.server import utils


# import any site hooks for the crashdump and crashinfo collection
get_site_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashdumps",
    lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashinfo",
    lambda host, test_start_time: None)
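# import_site_function falls back to the no-op lambdas above when no
# site_crashcollect module is available, so a plain (non-site) checkout
# simply collects nothing here.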


_timer = autotest_stats.Timer('crash_collection')
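# Functions decorated with _timer.decorate below report their run time
# under the 'crash_collection' stats key via the Graphite-backed
# autotest_stats module.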

@_timer.decorate
def get_crashdumps(host, test_start_time):
    """Collect crashdumps from the host via the site hook, if any."""
    get_site_crashdumps(host, test_start_time)


@_timer.decorate
def get_crashinfo(host, test_start_time):
    """Collect crash information from the host, waiting for it if needed.

    @param host: The RemoteHost to collect from
    @param test_start_time: Time the test started, passed to the hooks
    """
    logging.info("Collecting crash information...")

    # get_crashdumps collects orphaned crashdumps and symbolicates all
    # collected crashdumps. Symbolication can also happen during a post-job
    # task, by which time some crashdumps may already have been pulled back
    # from the machine, so this does not need to wait for the machine to
    # come up.
    get_crashdumps(host, test_start_time)

    if wait_for_machine_to_recover(host):
        # run any site-specific collection
        get_site_crashinfo(host, test_start_time)

        crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
        collect_messages(host)
        collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
        collect_uncollected_logs(host)

        # Collect everything in /var/log.
        log_path = os.path.join(crashinfo_dir, 'var')
        os.makedirs(log_path)
        collect_log_file(host, constants.LOG_DIR, log_path)

        # Collect console-ramoops.
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_CONSOLE_RAMOOPS))
        collect_log_file(host, constants.LOG_CONSOLE_RAMOOPS, log_path)
        # Collect i915_error_state, only available on Intel systems.
        # i915 contains the Intel graphics state. It might contain useful data
        # when a DUT hangs, times out or crashes.
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
        collect_log_file(host, constants.LOG_I915_ERROR_STATE,
                         log_path, use_tmp=True)
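# Note: after a successful run, get_crashinfo() leaves the per-host
# crashinfo directory holding the trimmed 'messages' log, 'dmesg' output,
# any stranded client logs, a copy of /var/log under 'var/', the
# console-ramoops log and, on Intel systems, the i915 error state.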


# Load default for number of hours to wait before giving up on crash collection.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
    'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)
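# The wait can be tuned through the [SERVER] section of the global/shadow
# config; an illustrative (hypothetical) entry:
#
#   [SERVER]
#   crash_collection_hours_to_wait: 2.0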


def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Wait for a machine (possibly down) to become accessible again.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine comes back up, False otherwise
    """
    current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, current_time)
    if not host.wait_up(timeout=hours_to_wait * 3600):
        autotest_stats.Counter('collect_crashinfo_timeout').increment()
        logging.warning("%s down, unable to collect crash info",
                        host.hostname)
        return False
    else:
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True


def get_crashinfo_dir(host, dir_prefix):
    """Find, and if necessary create, a directory to store crashinfo in.

    @param host: The RemoteHost object that crashinfo will be collected from
    @param dir_prefix: Prefix of the directory name.

    @returns: The path to an existing directory for writing crashinfo into
    """
    host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
    if host_resultdir:
        infodir = host_resultdir
    else:
        infodir = os.path.abspath(os.getcwd())
    infodir = os.path.join(infodir, "%s.%s" % (dir_prefix, host.hostname))
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    return infodir
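# Illustrative example (hypothetical host and result dir):
#   get_crashinfo_dir(host, 'crashinfo') for host "dut1" with a job
#   resultdir of /results/123-debug returns (and creates if missing)
#   /results/123-debug/crashinfo.dut1; without a job resultdir it falls
#   back to a subdirectory of the current working directory.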


def collect_log_file(host, log_path, dest_path, use_tmp=False):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copied logs into
    @param use_tmp: If True, first copy the logs to a temporary directory on
                    the host and download them from there.

    """
    logging.info('Collecting %s...', log_path)
    try:
        source_path = log_path
        if use_tmp:
            devnull = open('/dev/null', 'w')
            tmpdir = host.run('mktemp -d', stdout_tee=devnull).stdout.strip()
            devnull.close()
            host.run('cp -rp %s %s' % (log_path, tmpdir))
            source_path = os.path.join(tmpdir, os.path.basename(log_path))
        host.get_file(source_path, dest_path, preserve_perm=False)
        if use_tmp:
            host.run('rm -rf %s' % tmpdir)
    except Exception as e:
        logging.warning('Collection of %s failed: %s', log_path, e)
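# Illustrative usage (paths are hypothetical):
#   collect_log_file(host, '/var/log/messages', '/results/crashinfo.dut1/')
# With use_tmp=True the file is first copied with 'cp -rp' into a
# 'mktemp -d' directory on the host and fetched from there, as done for
# the i915 error state in get_crashinfo() above.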


def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be a filename and
    not a directory.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
        the output from.
    @param dest_path: A file path to write the command output into
    """
    logging.info("Collecting '%s' ...", command)
    devnull = open("/dev/null", "w")
    try:
        try:
            result = host.run(command, stdout_tee=devnull).stdout
            utils.open_write_close(dest_path, result)
        except Exception as e:
            logging.warning("Collection of '%s' failed:\n%s", command, e)
    finally:
        devnull.close()
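# Example mirroring the dmesg call in get_crashinfo():
#   collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))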


def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    @param host: The RemoteHost to collect from
    """
    if host.job:
        try:
            logs = host.job.get_client_logs()
            for hostname, remote_path, local_path in logs:
                if hostname == host.hostname:
                    logging.info("Retrieving logs from %s:%s into %s",
                                 hostname, remote_path, local_path)
                    host.get_file(remote_path + "/", local_path + "/")
        except Exception as e:
            logging.warning("Error while trying to collect stranded "
                            "Autotest client logs: %s", e)


def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGES_COPY_PATH is present on the remote machine,
    collects the contents of /var/log/messages excluding whatever initial
    contents are already present in host.VAR_LOG_MESSAGES_COPY_PATH. If it
    is not present, simply collects the entire contents of /var/log/messages.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')

    try:
        # paths to the messages files
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # grab the files from the remote host
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # figure out how much of messages.raw to skip
        if os.path.exists(messages_at_start):
            # the first line of messages at start should match the first
            # line of the current messages; if they don't, then messages has
            # been erased or rotated and we just grab all of it
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start != first_line_now:
                size_at_start = 0
            else:
                size_at_start = os.path.getsize(messages_at_start)
        else:
            size_at_start = 0
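        # copy everything in messages.raw past the bytes that were already
        # present at test start into the final "messages" file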
        raw_messages_file = open(messages_raw)
        messages_file = open(messages, "w")
        raw_messages_file.seek(size_at_start)
        shutil.copyfileobj(raw_messages_file, messages_file)
        raw_messages_file.close()
        messages_file.close()

        # get rid of the "raw" versions of messages
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)
    232