#pylint: disable-msg=C0111

"""
Pidfile monitor.
"""

import logging
import time
import traceback

import common

from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib import global_config
from autotest_lib.scheduler import drone_manager
from autotest_lib.scheduler import scheduler_config

try:
    from chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock


def _get_pidfile_timeout_secs():
    """@returns How long to wait for autoserv to write pidfile."""
    pidfile_timeout_mins = global_config.global_config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'pidfile_timeout_mins', type=int)
    return pidfile_timeout_mins * 60


class PidfileRunMonitor(object):
    """
    Client must call either run() to start a new process or
    attach_to_existing_process().
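
    A minimal usage sketch (illustrative only; command, working_directory and
    the process count are placeholders, and real callers poll rather than
    block):

        monitor = PidfileRunMonitor()
        monitor.run(command, working_directory, num_processes=1)
        # ... later, on each poll ...
        if monitor.exit_code() is not None:
            failures = monitor.num_tests_failed()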
     35     """
     36 
     37     class _PidfileException(Exception):
     38         """
     39         Raised when there's some unexpected behavior with the pid file, but only
     40         used internally (never allowed to escape this class).
     41         """
     42 
     43 
     44     def __init__(self):
     45         self._drone_manager = drone_manager.instance()
     46         self.lost_process = False
     47         self._start_time = None
     48         self.pidfile_id = None
     49         self._killed = False
     50         self._state = drone_manager.PidfileContents()
     51 
     52 
     53     def _add_nice_command(self, command, nice_level):
     54         if not nice_level:
     55             return command
     56         return ['nice', '-n', str(nice_level)] + command
     57 
     58 
     59     def _set_start_time(self):
     60         self._start_time = time.time()
     61 
     62 
    def run(self, command, working_directory, num_processes, nice_level=None,
            log_file=None, pidfile_name=None, paired_with_pidfile=None,
            username=None, drone_hostnames_allowed=None):
        """Start a new process via the drone manager and track its pidfile."""
        assert command is not None
        if nice_level is not None:
            command = ['nice', '-n', str(nice_level)] + command
        self._set_start_time()
        self.pidfile_id = self._drone_manager.execute_command(
            command, working_directory, pidfile_name=pidfile_name,
            num_processes=num_processes, log_file=log_file,
            paired_with_pidfile=paired_with_pidfile, username=username,
            drone_hostnames_allowed=drone_hostnames_allowed)


    def attach_to_existing_process(self, execution_path,
                                   pidfile_name=drone_manager.AUTOSERV_PID_FILE,
                                   num_processes=None):
        """Attach to a process that already has a pidfile on a drone."""
        self._set_start_time()
        self.pidfile_id = self._drone_manager.get_pidfile_id_from(
            execution_path, pidfile_name=pidfile_name)
        if num_processes is not None:
            self._drone_manager.declare_process_count(self.pidfile_id,
                                                      num_processes)


    def kill(self):
        """Kill the process recorded in the pidfile, if any."""
        if self.has_process():
            self._drone_manager.kill_process(self.get_process())
            self._killed = True


    def has_process(self):
        """@returns Whether the pidfile has recorded a process yet."""
        self._get_pidfile_info()
        return self._state.process is not None


    def get_process(self):
        """@returns The recorded process; has_process() must be true."""
        self._get_pidfile_info()
        assert self._state.process is not None
        return self._state.process


    def _read_pidfile(self, use_second_read=False):
        assert self.pidfile_id is not None, (
            'You must call run() or attach_to_existing_process()')
        contents = self._drone_manager.get_pidfile_contents(
            self.pidfile_id, use_second_read=use_second_read)
        if contents.is_invalid():
            self._state = drone_manager.PidfileContents()
            raise self._PidfileException(contents)
        self._state = contents


    def _handle_pidfile_error(self, error, message=''):
        self.on_lost_process(self._state.process)


    def _get_pidfile_info_helper(self):
        if self.lost_process:
            return

        self._read_pidfile()

        if self._state.process is None:
            self._handle_no_process()
            return

        if self._state.exit_status is None:
            # double check whether or not autoserv is running
            if self._drone_manager.is_process_running(self._state.process):
                return

            # pid but no running process - maybe process *just* exited
            self._read_pidfile(use_second_read=True)
            if self._state.exit_status is None:
                # autoserv exited without writing an exit code
                # to the pidfile
                self._handle_pidfile_error(
                    'autoserv died without writing exit code')


    def _get_pidfile_info(self):
        """\
        After completion, self._state will contain:
         pid=None, exit_status=None if autoserv has not yet run
         pid!=None, exit_status=None if autoserv is running
         pid!=None, exit_status!=None if autoserv has completed
        """
        try:
            self._get_pidfile_info_helper()
        except self._PidfileException:
            self._handle_pidfile_error('Pidfile error', traceback.format_exc())


    def _handle_no_process(self):
        """\
        Called when no pidfile is found or no pid is in the pidfile.
        """
        if time.time() - self._start_time > _get_pidfile_timeout_secs():
            # If we aborted the process, and we find that it has exited without
            # writing a pidfile, then it's because we killed it, and thus this
            # isn't a surprising situation.
            if not self._killed:
                metrics.Counter('chromeos/autotest/errors/scheduler/no_pidfile'
                                ).increment()
            else:
                logging.warning("%s didn't exit after SIGTERM", self.pidfile_id)
            self.on_lost_process()


    def on_lost_process(self, process=None):
        """\
        Called when autoserv has exited without writing an exit status,
        or we've timed out waiting for autoserv to write a pid to the
        pidfile.  In either case, we just return failure and the caller
        should signal some kind of warning.

        process is unimportant here, as it shouldn't be used by anyone.
        """
        self.lost_process = True
        self._state.process = process
        self._state.exit_status = 1
        self._state.num_tests_failed = 0


    def exit_code(self):
        """@returns The exit status, or None if not yet available."""
        self._get_pidfile_info()
        return self._state.exit_status


    def num_tests_failed(self):
        """@returns The number of tests that failed or -1 if unknown."""
        self._get_pidfile_info()
        if self._state.num_tests_failed is None:
            return -1
        return self._state.num_tests_failed


    def try_copy_results_on_drone(self, **kwargs):
        if self.has_process():
            # copy results logs into the normal place for job results
            self._drone_manager.copy_results_on_drone(
                self.get_process(), **kwargs)


    def try_copy_to_results_repository(self, source, **kwargs):
        if self.has_process():
            self._drone_manager.copy_to_results_repository(
                self.get_process(), source, **kwargs)