# Copyright 2009 Google Inc. Released under the GPL v2

"""
This module defines the base classes for the Host hierarchy.

Implementation details:
You should import the "hosts" package instead of importing each type of host.

        Host: a machine on which you can run programs
"""
     11 
     12 __author__ = """
     13 mbligh (at] google.com (Martin J. Bligh),
     14 poirier (at] google.com (Benjamin Poirier),
     15 stutsman (at] google.com (Ryan Stutsman)
     16 """
     17 
     18 import cPickle, cStringIO, logging, os, re, time
     19 
     20 from autotest_lib.client.common_lib import global_config, error, utils
     21 from autotest_lib.client.common_lib.cros import path_utils
     22 from autotest_lib.client.common_lib.cros.graphite import autotest_stats
     23 from autotest_lib.client.bin import partition
     24 
     25 
     26 class Host(object):
     27     """
     28     This class represents a machine on which you can run programs.
     29 
     30     It may be a local machine, the one autoserv is running on, a remote
     31     machine or a virtual machine.
     32 
     33     Implementation details:
     34     This is an abstract class, leaf subclasses must implement the methods
     35     listed here. You must not instantiate this class but should
     36     instantiate one of those leaf subclasses.
     37 
     38     When overriding methods that raise NotImplementedError, the leaf class
     39     is fully responsible for the implementation and should not chain calls
     40     to super. When overriding methods that are a NOP in Host, the subclass
     41     should chain calls to super(). The criteria for fitting a new method into
     42     one category or the other should be:
     43         1. If two separate generic implementations could reasonably be
     44            concatenated, then the abstract implementation should pass and
     45            subclasses should chain calls to super.
     46         2. If only one class could reasonably perform the stated function
     47            (e.g. two separate run() implementations cannot both be executed)
     48            then the method should raise NotImplementedError in Host, and
     49            the implementor should NOT chain calls to super, to ensure that
     50            only one implementation ever gets executed.
     51     """
     52 
    # Job object used by record(), log_kernel() and log_op(). While it is
    # None (or otherwise false) those helpers skip all job status logging.
    job = None
    # Reboot timing settings read from the HOSTS section of the global
    # config. The first three are the default arguments of
    # wait_for_restart(). NOTE(review): presumably seconds -- confirm.
    DEFAULT_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "default_reboot_timeout", type=int, default=1800)
    WAIT_DOWN_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_timeout", type=int, default=840)
    WAIT_DOWN_REBOOT_WARNING = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_warning", type=int, default=540)
    # Not referenced by any method visible in this part of the file.
    HOURS_TO_WAIT_FOR_RECOVERY = global_config.global_config.get_config_value(
        "HOSTS", "hours_to_wait_for_recovery", type=float, default=2.5)
    # the number of hardware repair requests that need to happen before we
    # actually send machines to hardware repair
    HARDWARE_REPAIR_REQUEST_THRESHOLD = 4
    # Names of the supported power operations and the list of all of them.
    OP_REBOOT = 'reboot'
    OP_SUSPEND = 'suspend'
    PWR_OPERATION = [OP_REBOOT, OP_SUSPEND]
     68 
     69 
    def __init__(self, *args, **dargs):
        """
        Construct the host by forwarding every argument to _initialize().

        @param args: positional arguments, passed through unchanged.
        @param dargs: keyword arguments, passed through unchanged.
        """
        self._initialize(*args, **dargs)
     72 
     73 
    def _initialize(self, *args, **dargs):
        """
        Initialization hook called by __init__ with the constructor arguments.

        This is a NOP in Host; per the class docstring, subclasses overriding
        a NOP method should chain calls to super().

        @param args: positional constructor arguments (ignored here).
        @param dargs: keyword constructor arguments (ignored here).
        """
        pass
     76 
     77 
    def close(self):
        """
        Hook for closing the host; a NOP in Host.

        Per the class docstring, subclasses overriding a NOP method should
        chain calls to super().
        """
        pass
     80 
     81 
    def setup(self):
        """
        Hook for setting up the host; a NOP in Host.

        Per the class docstring, subclasses overriding a NOP method should
        chain calls to super().
        """
        pass
     84 
     85 
    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None, args=()):
        """
        Run a command on this host.

        This is abstract in Host: the base implementation always raises
        NotImplementedError, and an implementing subclass must not chain
        to it. Several helpers in this class (run_output, get_boot_id,
        check_diskspace, ...) are built on top of this method.

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee/stderr_tee: where to tee the stdout/stderr;
                both default to utils.TEE_TO_LOGS
        @param stdin: stdin to pass (a string) to the executed command
        @param args: sequence of strings to pass as arguments to command by
                quoting them in " and escaping their contents if necessary

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        @raises NotImplementedError: always, when called on Host itself
        """
        raise NotImplementedError('Run not implemented!')
    110 
    111 
    def run_output(self, command, *args, **dargs):
        """
        Run a command and return its stdout with trailing whitespace removed.

        @param command: the command line string, handed to run().
        @param args: extra positional arguments forwarded to run().
        @param dargs: extra keyword arguments forwarded to run().

        @return the stdout of the command result, right-stripped.
        """
        result = self.run(command, *args, **dargs)
        return result.stdout.rstrip()
    114 
    115 
    def reboot(self):
        """
        Reboot the host. Abstract in Host.

        @raises NotImplementedError: always, when called on Host itself.
        """
        raise NotImplementedError('Reboot not implemented!')
    118 
    119 
    def suspend(self):
        """
        Suspend the host. Abstract in Host.

        @raises NotImplementedError: always, when called on Host itself.
        """
        raise NotImplementedError('Suspend not implemented!')
    122 
    123 
    def sysrq_reboot(self):
        """
        Reboot the host via sysrq. Abstract in Host.

        @raises NotImplementedError: always, when called on Host itself.
        """
        raise NotImplementedError('Sysrq reboot not implemented!')
    126 
    127 
    def reboot_setup(self, *args, **dargs):
        """
        Hook for work to do around a reboot; a NOP in Host.

        NOTE(review): the name suggests it runs before the reboot, but no
        caller is visible in this part of the file -- confirm.

        @param args: positional arguments (ignored here).
        @param dargs: keyword arguments (ignored here).
        """
        pass
    130 
    131 
    def reboot_followup(self, *args, **dargs):
        """
        Hook called after a successful reboot; a NOP in Host.

        wait_for_restart() calls this with its extra keyword arguments once
        the host has come back up.

        @param args: positional arguments (ignored here).
        @param dargs: keyword arguments (ignored here).
        """
        pass
    134 
    135 
    def get_file(self, source, dest, delete_dest=False):
        """
        Get a file from the host. Abstract in Host.

        NOTE(review): the parameter semantics are defined by the implementing
        subclass; presumably source is on the host and dest is local --
        confirm.

        @param source: path(s) to copy from.
        @param dest: path to copy to.
        @param delete_dest: defaults to False.

        @raises NotImplementedError: always, when called on Host itself.
        """
        raise NotImplementedError('Get file not implemented!')
    138 
    139 
    def send_file(self, source, dest, delete_dest=False):
        """
        Send a file to the host. Abstract in Host.

        NOTE(review): the parameter semantics are defined by the implementing
        subclass; presumably source is local and dest is on the host --
        confirm.

        @param source: path(s) to copy from.
        @param dest: path to copy to.
        @param delete_dest: defaults to False.

        @raises NotImplementedError: always, when called on Host itself.
        """
        raise NotImplementedError('Send file not implemented!')
    142 
    143 
    def get_tmp_dir(self):
        """
        Get a temporary directory on the host. Abstract in Host.

        @raises NotImplementedError: always, when called on Host itself.
        """
        raise NotImplementedError('Get temp dir not implemented!')
    146 
    147 
    def is_up(self):
        """
        Check whether the host is up. Abstract in Host.

        @raises NotImplementedError: always, when called on Host itself.
        """
        raise NotImplementedError('Is up not implemented!')
    150 
    151 
    def is_shutting_down(self):
        """ Indicates whether a machine is currently shutting down.

        @return False; the base class never reports a shutdown in progress.
        """
        return False
    155 
    156 
    def get_wait_up_processes(self):
        """
        Get the set of local processes to wait for in wait_up.

        The names come from the comma separated "wait_up_processes" value in
        the HOSTS section of the global config. Whitespace around each name
        is stripped and empty entries are dropped.

        @return a set of process name strings, possibly empty.
        """
        raw_value = global_config.global_config.get_config_value(
                "HOSTS", "wait_up_processes", default="")
        names = set()
        for entry in raw_value.strip().split(","):
            name = entry.strip()
            if name:
                names.add(name)
        return names
    165 
    166 
    def get_boot_id(self, timeout=60):
        """ Get a unique ID associated with the current boot.

        Two calls return the same string if the host did not reboot between
        them, and different strings if it rebooted at least once. The ID is
        read from /proc/sys/kernel/random/boot_id on the host; if that file
        does not exist the host echoes a marker message instead, which is
        translated to None here.

        @param timeout The number of seconds to wait before timing out.

        @return A string unique to this boot or None if not available."""
        id_path = '/proc/sys/kernel/random/boot_id'
        missing_marker = 'no boot_id available'
        shell_cmd = 'if [ -f %r ]; then cat %r; else echo %r; fi' % (
                id_path, id_path, missing_marker)
        reported = self.run(shell_cmd, timeout=timeout).stdout.strip()
        return None if reported == missing_marker else reported
    186 
    187 
    def wait_up(self, timeout=None):
        """
        Wait for the host to come up. Abstract in Host.

        wait_for_restart() treats a true return value as "the host is back"
        and a false one as a failed reboot.

        @param timeout: time limit for the wait; defaults to None.

        @raises NotImplementedError: always, when called on Host itself.
        """
        raise NotImplementedError('Wait up not implemented!')
    190 
    191 
    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
        """
        Wait for the host to go down. Abstract in Host.

        wait_for_restart() treats a false return value as "the host did not
        shut down".

        @param timeout: time limit for the wait; defaults to None.
        @param warning_timer: defaults to None. NOTE(review): presumably the
                delay before a warning is logged -- confirm in subclasses.
        @param old_boot_id: defaults to None. NOTE(review): presumably a
                boot ID from get_boot_id() taken before the reboot --
                confirm in subclasses.

        @raises NotImplementedError: always, when called on Host itself.
        """
        raise NotImplementedError('Wait down not implemented!')
    194 
    195 
    def _construct_host_metadata(self, type_str):
        """Build the metadata dict attached to a stats timer.

        @param type_str: String representing _type field in es db.
            For example: type_str='reboot_total'.

        @return a dict with the keys 'hostname' (self.hostname),
            'time_recorded' (the current time.time()) and '_type' (type_str).
        """
        return dict(hostname=self.hostname,
                    time_recorded=time.time(),
                    _type=type_str)
    208 
    209 
    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
                         down_timeout=WAIT_DOWN_REBOOT_TIMEOUT,
                         down_warning=WAIT_DOWN_REBOOT_WARNING,
                         log_failure=True, old_boot_id=None, **dargs):
        """ Wait for the host to come back from a reboot. This is a generic
        implementation based entirely on wait_up and wait_down.

        Three stats timers (total, wait_down, wait_up) are kept under the
        key 'Reboot.<board>', where board is taken from dargs. A timer is
        only stopped when its phase succeeds.

        @param timeout: passed to wait_up() as its timeout.
        @param down_timeout: passed to wait_down() as its timeout.
        @param down_warning: passed to wait_down() as its warning_timer.
        @param log_failure: if true, record an ABORT status when the host
                fails to shut down. A failure to come back up is always
                recorded.
        @param old_boot_id: passed to wait_down() unchanged.
        @param dargs: 'board' names the stats key; all of dargs is forwarded
                to reboot_followup() on success.

        @raises AutoservShutdownError: wait_down() returned a false value.
        @raises AutoservRebootError: wait_up() returned a false value.
        """
        stat_prefix = 'Reboot.%s' % dargs.get('board')

        total_reboot_timer = autotest_stats.Timer(
                '%s.total' % stat_prefix,
                metadata=self._construct_host_metadata('reboot_total'))
        wait_down_timer = autotest_stats.Timer(
                '%s.wait_down' % stat_prefix,
                metadata=self._construct_host_metadata('reboot_down'))

        total_reboot_timer.start()
        wait_down_timer.start()
        went_down = self.wait_down(timeout=down_timeout,
                                   warning_timer=down_warning,
                                   old_boot_id=old_boot_id)
        if not went_down:
            if log_failure:
                self.record("ABORT", None, "reboot.verify", "shut down failed")
            raise error.AutoservShutdownError("Host did not shut down")
        wait_down_timer.stop()

        wait_up_timer = autotest_stats.Timer(
                '%s.wait_up' % stat_prefix,
                metadata=self._construct_host_metadata('reboot_up'))
        wait_up_timer.start()
        if not self.wait_up(timeout):
            self.record("ABORT", None, "reboot.verify",
                        "Host did not return from reboot")
            raise error.AutoservRebootError("Host did not return from reboot")

        self.record("GOOD", None, "reboot.verify")
        self.reboot_followup(**dargs)
        wait_up_timer.stop()
        total_reboot_timer.stop()
    244 
    245 
    def verify(self):
        """
        Run every verification step in order: hardware, connectivity,
        software.

        All three steps are NOPs in Host, so the base implementation checks
        nothing unless a subclass overrides them.
        """
        self.verify_hardware()
        self.verify_connectivity()
        self.verify_software()
    250 
    251 
    def verify_hardware(self):
        """
        Hardware verification hook, called first by verify(); a NOP in Host.
        """
        pass
    254 
    255 
    def verify_connectivity(self):
        """
        Connectivity verification hook, called second by verify(); a NOP in
        Host.
        """
        pass
    258 
    259 
    def verify_software(self):
        """
        Software verification hook, called last by verify(); a NOP in Host.
        """
        pass
    262 
    263 
    def check_diskspace(self, path, gb):
        """Raises an error if path does not have at least gb GB free.

        The free space is taken from the fourth field of the last line that
        `df -PB 1000000 <path>` prints on the host, i.e. it is measured in
        whole megabytes. 1000 based SI units are used throughout.

        @param path The path to check for free disk space.
        @param gb A floating point number to compare with a granularity
            of 1 MB.

        @raises AutoservDiskFullHostError if path has less than gb GB free.
        """
        bytes_per_mb = 10 ** 6  # SI megabyte.
        mb_per_gb = 1000.0
        logging.info('Checking for >= %s GB of space under %s on machine %s',
                     gb, path, self.hostname)
        df_command = 'df -PB %d %s | tail -1' % (bytes_per_mb, path)
        fields = self.run(df_command).stdout.split()
        free_space_gb = int(fields[3]) / mb_per_gb
        if free_space_gb < gb:
            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
        logging.info('Found %s GB >= %s GB of space under %s on machine %s',
                     free_space_gb, gb, path, self.hostname)
    286 
    287 
    def check_inodes(self, path, min_kilo_inodes):
        """Raises an error if a file system is short on i-nodes.

        The free i-node count is taken from the fourth field of the last
        line that `df -Pi <path>` prints on the host.

        @param path The path to check for free i-nodes.
        @param min_kilo_inodes Minimum number of i-nodes required,
                               in units of 1000 i-nodes.

        @raises AutoservNoFreeInodesError If the minimum required
                                  i-node count isn't available.
        """
        required = 1000 * min_kilo_inodes
        logging.info('Checking for >= %d i-nodes under %s '
                     'on machine %s', required, path, self.hostname)
        df_command = 'df -Pi %s | tail -1' % path
        fields = self.run(df_command).stdout.split()
        available = int(fields[3])
        if available < required:
            raise error.AutoservNoFreeInodesError(path, required, available)
        logging.info('Found %d >= %d i-nodes under %s on '
                     'machine %s', available, required,
                     path, self.hostname)
    310 
    311 
    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
        """Empty a given directory path contents.

        Every direct child of path is removed recursively on the host; the
        directory itself is kept.

        @param path: directory whose contents are removed.
        @param ignore_status: forwarded to run(); defaults to True.
        @param timeout: forwarded to run(); defaults to 3600.
        """
        command = (
            'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
            % path)
        self.run(command, ignore_status=ignore_status, timeout=timeout)
    316 
    317 
    def repair(self):
        """Try and get the host to pass `self.verify()`.

        The base implementation performs no repair action of its own; it
        only calls verify(), so any exception verify() raises propagates to
        the caller.
        """
        self.verify()
    321 
    322 
    def disable_ipfilters(self):
        """Allow all network packets in and out of the host.

        The current rules are first saved to /tmp/iptable-rules on the host
        (enable_ipfilters() restores them from there), then the default
        policy of the INPUT, FORWARD and OUTPUT chains is set to ACCEPT.
        """
        commands = (
            'iptables-save > /tmp/iptable-rules',
            'iptables -P INPUT ACCEPT',
            'iptables -P FORWARD ACCEPT',
            'iptables -P OUTPUT ACCEPT',
        )
        for command in commands:
            self.run(command)
    329 
    330 
    def enable_ipfilters(self):
        """Re-enable the IP filters disabled from disable_ipfilters().

        Does nothing when the saved rules file /tmp/iptable-rules is not
        present on the host.
        """
        if not self.path_exists('/tmp/iptable-rules'):
            return
        self.run('iptables-restore < /tmp/iptable-rules')
    335 
    336 
    def cleanup(self):
        """
        Hook for cleaning up the host; a NOP in Host.

        Per the class docstring, subclasses overriding a NOP method should
        chain calls to super().
        """
        pass
    339 
    340 
    def machine_install(self):
        """
        Install the machine. Abstract in Host.

        @raises NotImplementedError: always, when called on Host itself.
        """
        raise NotImplementedError('Machine install not implemented!')
    343 
    344 
    def install(self, installableObject):
        """
        Install installableObject on this host.

        The work is delegated to the object: its install() method is called
        with this host as the only argument.

        @param installableObject: any object providing install(host).
        """
        installableObject.install(self)
    347 
    348 
    def get_autodir(self):
        """
        Get the autodir of the host. Abstract in Host.

        @raises NotImplementedError: always, when called on Host itself.
        """
        raise NotImplementedError('Get autodir not implemented!')
    351 
    352 
    def set_autodir(self):
        """
        Set the autodir of the host. Abstract in Host.

        NOTE(review): this signature takes no value to set; implementing
        subclasses may declare additional parameters -- confirm.

        @raises NotImplementedError: always, when called on Host itself.
        """
        raise NotImplementedError('Set autodir not implemented!')
    355 
    356 
    def start_loggers(self):
        """ Called to start continuous host logging.

        A NOP in Host; per the class docstring, subclasses overriding a NOP
        method should chain calls to super().
        """
        pass
    360 
    361 
    def stop_loggers(self):
        """ Called to stop continuous host logging.

        A NOP in Host; per the class docstring, subclasses overriding a NOP
        method should chain calls to super().
        """
        pass
    365 
    366 
    367     # some extra methods simplify the retrieval of information about the
    368     # Host machine, with generic implementations based on run(). subclasses
    369     # should feel free to override these if they can provide better
    370     # implementations for their specific Host types
    371 
    def get_num_cpu(self):
        """ Get the number of CPUs in the host according to /proc/cpuinfo.

        The count is the number of lines in /proc/cpuinfo that start with
        'processor'.

        @return the number of CPUs as an int (0 if no such line is found).
        """
        # The cpuinfo dump is sent to a null sink rather than the default
        # tee. Open the sink in a with-block so that the file handle is
        # closed again instead of leaking one descriptor per call.
        with open(os.devnull, 'w') as devnull:
            proc_cpuinfo = self.run('cat /proc/cpuinfo',
                                    stdout_tee=devnull).stdout
        return sum(1 for line in proc_cpuinfo.splitlines()
                   if line.startswith('processor'))
    381 
    382 
    def get_arch(self):
        """ Get the hardware architecture of the remote machine.

        @return the output of `uname -m` on the host, except that every
                iN86 variant (i486, i586, i686, ...) is reported as 'i386'.
        """
        uname = path_utils.must_be_installed('/bin/uname', host=self)
        machine = self.run('%s -m' % uname).stdout.rstrip()
        if re.match(r'i\d86$', machine):
            return 'i386'
        return machine
    390 
    391 
    def get_kernel_ver(self):
        """ Get the kernel version of the remote machine.

        @return the output of `uname -r` on the host, right-stripped.
        """
        uname = path_utils.must_be_installed('/bin/uname', host=self)
        result = self.run('%s -r' % uname)
        return result.stdout.rstrip()
    396 
    397 
    def get_cmdline(self):
        """ Get the kernel command line of the remote machine.

        @return the contents of /proc/cmdline on the host, right-stripped.
        """
        result = self.run('cat /proc/cmdline')
        return result.stdout.rstrip()
    401 
    402 
    def get_meminfo(self):
        """ Get the kernel memory info (/proc/meminfo) of the remote machine
        and return a dictionary mapping the various statistics.

        Each 'Key: value' line becomes one entry, with whitespace stripped
        from both the key and the value (the value keeps its unit suffix as
        printed, it is not converted to a number). Lines without a ':' are
        skipped.

        @return a dict mapping statistic name strings to value strings.
        """
        meminfo_dict = {}
        for line in self.run('cat /proc/meminfo').stdout.splitlines():
            # partition() never raises, unlike unpacking split(':', 1),
            # so a blank or malformed line cannot abort the whole parse.
            key, separator, val = line.partition(':')
            if not separator:
                continue
            meminfo_dict[key.strip()] = val.strip()
        return meminfo_dict
    411 
    412 
    def path_exists(self, path):
        """ Determine if path exists on the remote machine.

        @param path: the path to test; it is shell-escaped before use.

        @return True if `ls` on the path exits with status 0 on the host,
                False otherwise.
        """
        escaped_path = utils.sh_escape(path)
        ls_result = self.run('ls "%s" > /dev/null' % escaped_path,
                             ignore_status=True)
        return ls_result.exit_status == 0
    418 
    419 
    420     # some extra helpers for doing job-related operations
    421 
    def record(self, *args, **dargs):
        """ Record a status log entry against Host.job.

        Silently does nothing if Host.job is not available. Otherwise args
        and dargs are passed on to Host.job.record unchanged. """
        if not self.job:
            return
        self.job.record(*args, **dargs)
    428 
    429 
    def log_kernel(self):
        """ Log the running kernel version into the status logs.

        Intended for cases where the "current" kernel is not really defined
        and we want to explicitly log it. The version from get_kernel_ver()
        is recorded as an INFO entry with the optional field "kernel". Does
        nothing if this host isn't actually associated with a job. """
        if not self.job:
            return
        running_kernel = self.get_kernel_ver()
        fields = {"kernel": running_kernel}
        self.job.record("INFO", None, None, optional_fields=fields)
    439 
    440 
    def log_op(self, op, op_func):
        """ Wrap a management operation in a group for status logging
        purposes.

        With a job attached, the operation is run through job.run_op(), and
        the RUNNING_LOG_OP attribute marks this host as being inside such a
        group. If there is no job, or the marker is already set (a nested
        call), op_func is simply called directly.

        @param op: name of the operation.
        @param op_func: a function that carries out the operation
                        (reboot, suspend)
        """
        if not self.job or hasattr(self, "RUNNING_LOG_OP"):
            op_func()
            return

        self.RUNNING_LOG_OP = True
        try:
            self.job.run_op(op, op_func, self.get_kernel_ver)
        finally:
            # Always clear the marker, even if the operation failed.
            del self.RUNNING_LOG_OP
    457 
    458 
    def list_files_glob(self, glob):
        """
        Get a list of files on a remote host given a glob pattern path.

        A small Python script is run on the host; it expands the pattern
        with glob.glob() and writes the result to stdout as a protocol 0
        pickle, which is unpickled here.

        @param glob: the glob pattern string, passed to the script as its
                only argument.

        @return the list of matching paths as produced on the host.
        """
        SCRIPT = ("python -c 'import cPickle, glob, sys;"
                  "cPickle.dump(glob.glob(sys.argv[1]), sys.stdout, 0)'")
        # stdout_tee=None: the stdout here is pickle data, not log text.
        output = self.run(SCRIPT, args=(glob,), stdout_tee=None,
                          timeout=60).stdout
        # NOTE(review): unpickling trusts whatever the host printed; this is
        # only safe when the host itself is trusted.
        return cPickle.loads(output)
    468 
    469 
    def symlink_closure(self, paths):
        """
        Given a sequence of path strings, return the set of all paths that
        can be reached from the initial set by following symlinks.

        The work is done by a Python script run on the host. The input paths
        are sent to it on stdin as a pickled dict and the result comes back
        on stdout as a pickled list (both protocol 0). Paths that do not
        exist on the host are dropped from the result.

        @param paths: sequence of path strings.
        @return: a sequence of path strings that are all the unique paths that
                can be reached from the given ones after following symlinks.
        """
        # The script uses dicts with None values as sets: `paths` is the
        # work list and `closure` collects the existing paths seen so far.
        # A symlink's target is resolved relative to the link's directory.
        SCRIPT = ("python -c 'import cPickle, os, sys\n"
                  "paths = cPickle.load(sys.stdin)\n"
                  "closure = {}\n"
                  "while paths:\n"
                  "    path = paths.keys()[0]\n"
                  "    del paths[path]\n"
                  "    if not os.path.exists(path):\n"
                  "        continue\n"
                  "    closure[path] = None\n"
                  "    if os.path.islink(path):\n"
                  "        link_to = os.path.join(os.path.dirname(path),\n"
                  "                               os.readlink(path))\n"
                  "        if link_to not in closure.keys():\n"
                  "            paths[link_to] = None\n"
                  "cPickle.dump(closure.keys(), sys.stdout, 0)'")
        input_data = cPickle.dumps(dict((path, None) for path in paths), 0)
        # stdout_tee=None: the stdout here is pickle data, not log text.
        output = self.run(SCRIPT, stdout_tee=None, stdin=input_data,
                          timeout=60).stdout
        # NOTE(review): unpickling trusts whatever the host printed; this is
        # only safe when the host itself is trusted.
        return cPickle.loads(output)
    498 
    499 
    def cleanup_kernels(self, boot_dir='/boot'):
        """
        Remove any kernel image and associated files (vmlinux, system.map,
        modules) for any image found in the boot directory that is not
        referenced by entries in the bootloader configuration.

        A file counts as used if it is reachable, following symlinks, from
        '<prefix><version>' for a kernel version named by a bootloader
        entry. Unused vmlinuz images are removed through rpm when a package
        owns them and then deleted directly in any case. All removal
        commands ignore failures.

        @param boot_dir: boot directory path string, default '/boot'
        """
        # kernel versions of the vmlinuz images the bootloader refers to
        vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-')
        bootloader_entries = self.bootloader.get_entries()
        used_kernver = [entry['kernel'][len(vmlinuz_prefix):]
                        for entry in bootloader_entries.itervalues()]

        def find_unused(prefix, keep_candidate=None):
            """Return the set of paths under prefix no used kernel needs."""
            candidates = self.list_files_glob(prefix + '*')
            if keep_candidate is not None:
                candidates = [candidate for candidate in candidates
                              if keep_candidate(candidate)]
            referenced = self.symlink_closure(prefix + kernver
                                              for kernver in used_kernver)
            return set(candidates) - set(referenced)

        # unused vmlinuz, vmlinux and System.map files in the boot directory
        unused_vmlinuz = find_unused(vmlinuz_prefix)
        unused_vmlinux = find_unused(os.path.join(boot_dir, 'vmlinux-'))
        unused_system_map = find_unused(os.path.join(boot_dir, 'System.map-'))

        # module directories of unused kernels; only directories whose name
        # starts with a version number are ever considered
        modules_prefix = '/lib/modules/'
        versioned_moddir = modules_prefix + r'\d+\.\d+\.\d+.*'
        unused_moddirs = find_unused(
                modules_prefix,
                lambda path: re.match(versioned_moddir, path))

        # remove all the vmlinuz files we don't use
        # TODO: if needed this should become package manager agnostic
        for image in unused_vmlinuz:
            # ask rpm which package(s), if any, own the image
            owner_query = self.run('rpm -qf', args=(image,),
                                   ignore_status=True, timeout=120)
            if owner_query.exit_status == 0:
                owners = set(line.strip()
                             for line in owner_query.stdout.splitlines())
                # if we found some package names, try to remove them
                for owner in owners:
                    self.run('rpm -e', args=(owner,),
                             ignore_status=True, timeout=120)
            # remove the image file anyway, even if rpm didn't
            self.run('rm -f', args=(image,),
                     ignore_status=True, timeout=120)

        # remove all the vmlinux and System.map files left over
        for leftover in (unused_vmlinux | unused_system_map):
            self.run('rm -f', args=(leftover,),
                     ignore_status=True, timeout=120)

        # remove all unused module directories
        # the regex match should keep us safe from removing the wrong files
        for moddir in unused_moddirs:
            self.run('rm -fr', args=(moddir,), ignore_status=True)
    568