Home | History | Annotate | Download | only in hosts
      1 #
      2 # Copyright 2007 Google Inc. Released under the GPL v2
      3 
      4 """
      5 This module defines the SSHHost class.
      6 
      7 Implementation details:
      8 You should import the "hosts" package instead of importing each type of host.
      9 
     10         SSHHost: a remote machine with a ssh access
     11 """
     12 
     13 import inspect
     14 import logging
     15 import re
     16 import time
     17 
     18 import common
     19 from autotest_lib.client.common_lib import error
     20 from autotest_lib.client.common_lib import pxssh
     21 from autotest_lib.server import utils
     22 from autotest_lib.server.hosts import abstract_ssh
     23 
     24 # In case cros_host is being ran via SSP on an older Moblab version with an
     25 # older chromite version.
     26 try:
     27     from chromite.lib import metrics
     28 except ImportError:
     29     metrics = utils.metrics_mock
     30 
     31 
     32 class SSHHost(abstract_ssh.AbstractSSHHost):
     33     """
     34     This class represents a remote machine controlled through an ssh
     35     session on which you can run programs.
     36 
     37     It is not the machine autoserv is running on. The machine must be
     38     configured for password-less login, for example through public key
     39     authentication.
     40 
     41     It includes support for controlling the machine through a serial
     42     console on which you can run programs. If such a serial console is
     43     set up on the machine then capabilities such as hard reset and
     44     boot strap monitoring are available. If the machine does not have a
     45     serial console available then ordinary SSH-based commands will
     46     still be available, but attempts to use extensions such as
     47     console logging or hard reset will fail silently.
     48 
     49     Implementation details:
     50     This is a leaf class in an abstract class hierarchy, it must
     51     implement the unimplemented methods in parent classes.
     52     """
     53 
     54     def _initialize(self, hostname, *args, **dargs):
     55         """
     56         Construct a SSHHost object
     57 
     58         Args:
     59                 hostname: network hostname or address of remote machine
     60         """
     61         super(SSHHost, self)._initialize(hostname=hostname, *args, **dargs)
     62         self.setup_ssh()
     63 
     64 
     65     def ssh_command(self, connect_timeout=30, options='', alive_interval=300,
     66                     alive_count_max=3, connection_attempts=1):
     67         """
     68         Construct an ssh command with proper args for this host.
     69 
     70         @param connect_timeout: connection timeout (in seconds)
     71         @param options: SSH options
     72         @param alive_interval: SSH Alive interval.
     73         @param alive_count_max: SSH AliveCountMax.
     74         @param connection_attempts: SSH ConnectionAttempts
     75         """
     76         options = " ".join([options, self._master_ssh.ssh_option])
     77         base_cmd = self.make_ssh_command(user=self.user, port=self.port,
     78                                          opts=options,
     79                                          hosts_file=self.known_hosts_file,
     80                                          connect_timeout=connect_timeout,
     81                                          alive_interval=alive_interval,
     82                                          alive_count_max=alive_count_max,
     83                                          connection_attempts=connection_attempts)
     84         return "%s %s" % (base_cmd, self.hostname)
     85 
     86     def _get_server_stack_state(self, lowest_frames=0, highest_frames=None):
     87         """ Get the server stack frame status.
     88         @param lowest_frames: the lowest frames to start printing.
     89         @param highest_frames: the highest frames to print.
     90                         (None means no restriction)
     91         """
     92         stack_frames = inspect.stack()
     93         stack = ''
     94         for frame in stack_frames[lowest_frames:highest_frames]:
     95             function_name = inspect.getframeinfo(frame[0]).function
     96             stack = '%s|%s' % (function_name, stack)
     97         del stack_frames
     98         return stack[:-1] # Delete the last '|' character
     99 
    100     def _verbose_logger_command(self, command):
    101         """
    102         Prepend the command for the client with information about the ssh
    103         command to be executed and the server stack state.
    104 
    105         @param command: the ssh command to be executed.
    106         """
    107         # The last 3 frames on the stack are boring. Print 6-3=3 stack frames.
    108         stack = self._get_server_stack_state(lowest_frames=3, highest_frames=6)
    109         # If "logger" executable exists on the DUT use it to respew |command|.
    110         # Then regardless of "logger" run |command| as usual.
    111         command = ('if type "logger" > /dev/null 2>&1; then'
    112                    ' logger -tag "autotest" "server[stack::%s] -> ssh_run(%s)";'
    113                    'fi; '
    114                    '%s' % (stack, utils.sh_escape(command), command))
    115         return command
    116 
    117 
    118     def _run(self, command, timeout, ignore_status,
    119              stdout, stderr, connect_timeout, env, options, stdin, args,
    120              ignore_timeout, ssh_failure_retry_ok):
    121         """Helper function for run()."""
    122         if connect_timeout > timeout:
    123             connect_timeout = int(timeout)
    124 
    125         ssh_cmd = self.ssh_command(connect_timeout, options)
    126         if not env.strip():
    127             env = ""
    128         else:
    129             env = "export %s;" % env
    130         for arg in args:
    131             command += ' "%s"' % utils.sh_escape(arg)
    132         full_cmd = '%s "%s %s"' % (ssh_cmd, env, utils.sh_escape(command))
    133 
    134         # TODO(jrbarnette):  crbug.com/484726 - When we're in an SSP
    135         # container, sometimes shortly after reboot we will see DNS
    136         # resolution errors on ssh commands; the problem never
    137         # occurs more than once in a row.  This especially affects
    138         # the autoupdate_Rollback test, but other cases have been
    139         # affected, too.
    140         #
    141         # We work around it by detecting the first DNS resolution error
    142         # and retrying exactly one time.
    143         dns_error_retry_count = 1
    144 
    145         def counters_inc(counter_name, failure_name):
    146             """Helper function to increment metrics counters.
    147             @param counter_name: string indicating which counter to use
    148             @param failure_name: string indentifying an error, or 'success'
    149             """
    150             if counter_name == 'call':
    151                 # ssh_counter records the outcome of each ssh invocation
    152                 # inside _run(), including exceptions.
    153                 ssh_counter = metrics.Counter('chromeos/autotest/ssh/calls')
    154                 fields = {'error' : failure_name or 'success',
    155                           'attempt' : ssh_call_count}
    156                 ssh_counter.increment(fields=fields)
    157 
    158             if counter_name == 'run':
    159                 # run_counter records each call to _run() with its result
    160                 # and how many tries were made.  Calls are recorded when
    161                 # _run() exits (including exiting with an exception)
    162                 run_counter = metrics.Counter('chromeos/autotest/ssh/runs')
    163                 fields = {'error' : failure_name or 'success',
    164                           'attempt' : ssh_call_count}
    165                 run_counter.increment(fields=fields)
    166 
    167         # If ssh_failure_retry_ok is True, retry twice on timeouts and generic
    168         # error 255: if a simple retry doesn't work, kill the ssh master
    169         # connection and try again.  (Note that either error could come from
    170         # the command running in the DUT, in which case the retry may be
    171         # useless but, in theory, also harmless.)
    172         if ssh_failure_retry_ok:
    173             # Ignore ssh command timeout, even though it could be a timeout due
    174             # to the command executing in the remote host.  Note that passing
    175             # ignore_timeout = True makes utils.run() return None on timeouts
    176             # (and only on timeouts).
    177             original_ignore_timeout = ignore_timeout
    178             ignore_timeout = True
    179             ssh_failure_retry_count = 2
    180         else:
    181             ssh_failure_retry_count = 0
    182 
    183         ssh_call_count = 0
    184 
    185         while True:
    186             try:
    187                 # Increment call count first, in case utils.run() throws an
    188                 # exception.
    189                 ssh_call_count += 1
    190                 result = utils.run(full_cmd, timeout, True, stdout, stderr,
    191                                    verbose=False, stdin=stdin,
    192                                    stderr_is_expected=ignore_status,
    193                                    ignore_timeout=ignore_timeout)
    194             except Exception as e:
    195                 # No retries on exception.
    196                 counters_inc('call', 'exception')
    197                 counters_inc('run', 'exception')
    198                 raise e
    199 
    200             failure_name = None
    201 
    202             if result:
    203                 if result.exit_status == 255:
    204                     if re.search(r'^ssh: .*: Name or service not known',
    205                                  result.stderr):
    206                         failure_name = 'dns_failure'
    207                     else:
    208                         failure_name = 'error_255'
    209                 elif result.exit_status > 0:
    210                     failure_name = 'nonzero_status'
    211             else:
    212                 # result == None
    213                 failure_name = 'timeout'
    214 
    215             # Record the outcome of the ssh invocation.
    216             counters_inc('call', failure_name)
    217 
    218             if failure_name:
    219                 # There was a failure: decide whether to retry.
    220                 if failure_name == 'dns_failure':
    221                     if dns_error_retry_count > 0:
    222                         logging.debug('retrying ssh because of DNS failure')
    223                         dns_error_retry_count -= 1
    224                         continue
    225                 else:
    226                     if ssh_failure_retry_count == 2:
    227                         logging.debug('retrying ssh command after %s',
    228                                        failure_name)
    229                         ssh_failure_retry_count -= 1
    230                         continue
    231                     elif ssh_failure_retry_count == 1:
    232                         # After two failures, restart the master connection
    233                         # before the final try.
    234                         logging.debug('retry 2: restarting master connection')
    235                         self.restart_master_ssh()
    236                         # Last retry: reinstate timeout behavior.
    237                         ignore_timeout = original_ignore_timeout
    238                         ssh_failure_retry_count -= 1
    239                         continue
    240 
    241             # No retry conditions occurred.  Exit the loop.
    242             break
    243 
    244         # The outcomes of ssh invocations have been recorded.  Now record
    245         # the outcome of this function.
    246 
    247         if ignore_timeout and not result:
    248             counters_inc('run', 'ignored_timeout')
    249             return None
    250 
    251         # The error messages will show up in band (indistinguishable
    252         # from stuff sent through the SSH connection), so we have the
    253         # remote computer echo the message "Connected." before running
    254         # any command.  Since the following 2 errors have to do with
    255         # connecting, it's safe to do these checks.
    256         if result.exit_status == 255:
    257             if re.search(r'^ssh: connect to host .* port .*: '
    258                          r'Connection timed out\r$', result.stderr):
    259                 counters_inc('run', 'final_timeout')
    260                 raise error.AutoservSSHTimeout("ssh timed out", result)
    261             if "Permission denied." in result.stderr:
    262                 msg = "ssh permission denied"
    263                 counters_inc('run', 'final_eperm')
    264                 raise error.AutoservSshPermissionDeniedError(msg, result)
    265 
    266         if not ignore_status and result.exit_status > 0:
    267             counters_inc('run', 'final_run_error')
    268             raise error.AutoservRunError("command execution error", result)
    269 
    270         counters_inc('run', failure_name)
    271         return result
    272 
    273 
    274     def run_very_slowly(self, command, timeout=None, ignore_status=False,
    275             stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
    276             connect_timeout=30, options='', stdin=None, verbose=True, args=(),
    277             ignore_timeout=False, ssh_failure_retry_ok=False):
    278         """
    279         Run a command on the remote host.
    280         This RPC call has an overhead of minimum 40ms and up to 400ms on
    281         servers (crbug.com/734887). Each time a run_very_slowly is added for
    282         every job - a server core dies in the lab.
    283         @see common_lib.hosts.host.run()
    284 
    285         @param timeout: command execution timeout in seconds. Default is 1 hour.
    286         @param connect_timeout: ssh connection timeout (in seconds)
    287         @param options: string with additional ssh command options
    288         @param verbose: log the commands
    289         @param ignore_timeout: bool True if SSH command timeouts should be
    290                 ignored.  Will return None on command timeout.
    291         @param ssh_failure_retry_ok: True if the command may be retried on
    292                 probable ssh failure (error 255 or timeout).  When true,
    293                 the command may be executed up to three times, the second
    294                 time after restarting the ssh master connection.  Use only for
    295                 commands that are idempotent, because when a "probable
    296                 ssh failure" occurs, we cannot tell if the command executed
    297                 or not.
    298 
    299         @raises AutoservRunError: if the command failed
    300         @raises AutoservSSHTimeout: ssh connection has timed out
    301         """
    302         if timeout is None:
    303             timeout = 3600
    304         start_time = time.time()
    305         with metrics.SecondsTimer('chromeos/autotest/ssh/master_ssh_time',
    306                                   scale=0.001):
    307             if verbose:
    308                 stack = self._get_server_stack_state(lowest_frames=1,
    309                                                      highest_frames=7)
    310                 logging.debug("Running (ssh) '%s' from '%s'", command, stack)
    311                 command = self._verbose_logger_command(command)
    312 
    313             self.start_master_ssh(min(
    314                     timeout,
    315                     self.DEFAULT_START_MASTER_SSH_TIMEOUT_S,
    316             ))
    317 
    318             env = " ".join("=".join(pair) for pair in self.env.iteritems())
    319             elapsed = time.time() - start_time
    320             try:
    321                 return self._run(command, timeout - elapsed, ignore_status,
    322                                  stdout_tee, stderr_tee, connect_timeout, env,
    323                                  options, stdin, args, ignore_timeout,
    324                                  ssh_failure_retry_ok)
    325             except error.CmdError, cmderr:
    326                 # We get a CmdError here only if there is timeout of that
    327                 # command. Catch that and stuff it into AutoservRunError and
    328                 # raise it.
    329                 timeout_message = str('Timeout encountered: %s' %
    330                                       cmderr.args[0])
    331                 raise error.AutoservRunError(timeout_message, cmderr.args[1])
    332 
    333 
    334     def run(self, *args, **kwargs):
    335         return self.run_very_slowly(*args, **kwargs)
    336 
    337 
    338     def run_background(self, command, verbose=True):
    339         """Start a command on the host in the background.
    340 
    341         The command is started on the host in the background, and
    342         this method call returns immediately without waiting for the
    343         command's completion.  The PID of the process on the host is
    344         returned as a string.
    345 
    346         The command may redirect its stdin, stdout, or stderr as
    347         necessary.  Without redirection, all input and output will
    348         use /dev/null.
    349 
    350         @param command The command to run in the background
    351         @param verbose As for `self.run()`
    352 
    353         @return Returns the PID of the remote background process
    354                 as a string.
    355         """
    356         # Redirection here isn't merely hygienic; it's a functional
    357         # requirement.  sshd won't terminate until stdin, stdout,
    358         # and stderr are all closed.
    359         #
    360         # The subshell is needed to do the right thing in case the
    361         # passed in command has its own I/O redirections.
    362         cmd_fmt = '( %s ) </dev/null >/dev/null 2>&1 & echo -n $!'
    363         return self.run(cmd_fmt % command, verbose=verbose).stdout
    364 
    365 
    366     def run_short(self, command, **kwargs):
    367         """
    368         Calls the run() command with a short default timeout.
    369 
    370         Takes the same arguments as does run(),
    371         with the exception of the timeout argument which
    372         here is fixed at 60 seconds.
    373         It returns the result of run.
    374 
    375         @param command: the command line string
    376 
    377         """
    378         return self.run(command, timeout=60, **kwargs)
    379 
    380 
    381     def run_grep(self, command, timeout=30, ignore_status=False,
    382                  stdout_ok_regexp=None, stdout_err_regexp=None,
    383                  stderr_ok_regexp=None, stderr_err_regexp=None,
    384                  connect_timeout=30):
    385         """
    386         Run a command on the remote host and look for regexp
    387         in stdout or stderr to determine if the command was
    388         successul or not.
    389 
    390 
    391         @param command: the command line string
    392         @param timeout: time limit in seconds before attempting to
    393                         kill the running process. The run() function
    394                         will take a few seconds longer than 'timeout'
    395                         to complete if it has to kill the process.
    396         @param ignore_status: do not raise an exception, no matter
    397                               what the exit code of the command is.
    398         @param stdout_ok_regexp: regexp that should be in stdout
    399                                  if the command was successul.
    400         @param stdout_err_regexp: regexp that should be in stdout
    401                                   if the command failed.
    402         @param stderr_ok_regexp: regexp that should be in stderr
    403                                  if the command was successul.
    404         @param stderr_err_regexp: regexp that should be in stderr
    405                                  if the command failed.
    406         @param connect_timeout: connection timeout (in seconds)
    407 
    408         Returns:
    409                 if the command was successul, raises an exception
    410                 otherwise.
    411 
    412         Raises:
    413                 AutoservRunError:
    414                 - the exit code of the command execution was not 0.
    415                 - If stderr_err_regexp is found in stderr,
    416                 - If stdout_err_regexp is found in stdout,
    417                 - If stderr_ok_regexp is not found in stderr.
    418                 - If stdout_ok_regexp is not found in stdout,
    419         """
    420 
    421         # We ignore the status, because we will handle it at the end.
    422         result = self.run(command, timeout, ignore_status=True,
    423                           connect_timeout=connect_timeout)
    424 
    425         # Look for the patterns, in order
    426         for (regexp, stream) in ((stderr_err_regexp, result.stderr),
    427                                  (stdout_err_regexp, result.stdout)):
    428             if regexp and stream:
    429                 err_re = re.compile (regexp)
    430                 if err_re.search(stream):
    431                     raise error.AutoservRunError(
    432                         '%s failed, found error pattern: "%s"' % (command,
    433                                                                 regexp), result)
    434 
    435         for (regexp, stream) in ((stderr_ok_regexp, result.stderr),
    436                                  (stdout_ok_regexp, result.stdout)):
    437             if regexp and stream:
    438                 ok_re = re.compile (regexp)
    439                 if ok_re.search(stream):
    440                     if ok_re.search(stream):
    441                         return
    442 
    443         if not ignore_status and result.exit_status > 0:
    444             raise error.AutoservRunError("command execution error", result)
    445 
    446 
    447     def setup_ssh_key(self):
    448         """Setup SSH Key"""
    449         logging.debug('Performing SSH key setup on %s as %s.',
    450                       self.host_port, self.user)
    451 
    452         try:
    453             host = pxssh.pxssh()
    454             host.login(self.hostname, self.user, self.password,
    455                         port=self.port)
    456             public_key = utils.get_public_key()
    457 
    458             host.sendline('mkdir -p ~/.ssh')
    459             host.prompt()
    460             host.sendline('chmod 700 ~/.ssh')
    461             host.prompt()
    462             host.sendline("echo '%s' >> ~/.ssh/authorized_keys; " %
    463                             public_key)
    464             host.prompt()
    465             host.sendline('chmod 600 ~/.ssh/authorized_keys')
    466             host.prompt()
    467             host.logout()
    468 
    469             logging.debug('SSH key setup complete.')
    470 
    471         except:
    472             logging.debug('SSH key setup has failed.')
    473             try:
    474                 host.logout()
    475             except:
    476                 pass
    477 
    478 
    479     def setup_ssh(self):
    480         """Setup SSH"""
    481         if self.password:
    482             try:
    483                 self.ssh_ping()
    484             except error.AutoservSshPingHostError:
    485                 self.setup_ssh_key()
    486