Home | History | Annotate | Download | only in site_utils
      1 #!/usr/bin/python
      2 #
      3 # Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
      4 # Use of this source code is governed by a BSD-style license that can be
      5 # found in the LICENSE file.
      6 
      7 import datetime as datetime_base
      8 import logging
      9 from datetime import datetime
     10 
     11 import common
     12 
     13 from autotest_lib.client.common_lib import global_config
     14 from autotest_lib.client.common_lib import host_states
     15 from autotest_lib.client.common_lib import time_utils
     16 from autotest_lib.server import utils
     17 from autotest_lib.server.cros.dynamic_suite import reporting_utils
     18 from autotest_lib.server.lib import status_history
     19 
# Module-level shorthand for the global_config object exported by
# autotest_lib.client.common_lib.global_config.
CONFIG = global_config.global_config
     21 
     22 
     23 class BoardNotAvailableError(utils.TestLabException):
     24     """Raised when a board is not available in the lab."""
     25 
     26 
     27 class NotEnoughDutsError(utils.TestLabException):
     28     """Rasied when the lab doesn't have the minimum number of duts."""
     29 
     30     def __init__(self, board, pool, num_available, num_required, hosts):
     31         """Initialize instance.
     32 
     33         Please pass arguments by keyword.
     34 
     35         @param board: Name of board.
     36         @param pool: Name of pool.
     37         @param num_available: Number of available hosts.
     38         @param num_required: Number of hosts required.
     39         @param hosts: Sequence of Host instances for given board and pool.
     40         """
     41         self.board = board
     42         self.pool = pool
     43         self.num_available = num_available
     44         self.num_required = num_required
     45         self.hosts = hosts
     46         self.bug_id = None
     47         self.suite_name = None
     48         self.build = None
     49 
     50 
     51     def __repr__(self):
     52         return (
     53             '<{cls} at 0x{id:x} with'
     54             ' board={this.board!r},'
     55             ' pool={this.pool!r},'
     56             ' num_available={this.num_available!r},'
     57             ' num_required={this.num_required!r},'
     58             ' bug_id={this.bug_id!r},'
     59             ' suite_name={this.suite_name!r},'
     60             ' build={this.build!r}>'
     61             .format(cls=type(self).__name__, id=id(self), this=self)
     62         )
     63 
     64 
     65     def __str__(self):
     66         msg_parts = [
     67             'Not enough DUTs for board: {this.board}, pool: {this.pool};'
     68             ' required: {this.num_required}, found: {this.num_available}'
     69         ]
     70         format_dict = {'this': self}
     71         if self.bug_id is not None:
     72             msg_parts.append('bug: {bug_url}')
     73             format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id)
     74         if self.suite_name is not None:
     75             msg_parts.append('suite: {this.suite_name}')
     76         if self.build is not None:
     77             msg_parts.append('build: {this.build}')
     78         return ', '.join(msg_parts).format(**format_dict)
     79 
     80 
     81     def add_bug_id(self, bug_id):
     82         """Add crbug id associated with this exception.
     83 
     84         @param bug_id  crbug id whose str() value is used in a crbug URL.
     85         """
     86         self.bug_id = bug_id
     87 
     88 
     89     def add_suite_name(self, suite_name):
     90         """Add name of test suite that needed the DUTs.
     91 
     92         @param suite_name  Name of test suite.
     93         """
     94         self.suite_name = suite_name
     95 
     96 
     97     def add_build(self, build):
     98         """Add name of build of job that needed the DUTs.
     99 
    100         @param build  Name of build.
    101         """
    102         self.build = build
    103 
    104 
    105 class SimpleTimer(object):
    106     """A simple timer used to periodically check if a deadline has passed."""
    107 
    108     def _reset(self):
    109         """Reset the deadline."""
    110         if not self.interval_hours or self.interval_hours < 0:
    111             logging.error('Bad interval %s', self.interval_hours)
    112             self.deadline = None
    113             return
    114         self.deadline = datetime.now() + datetime_base.timedelta(
    115                 hours=self.interval_hours)
    116 
    117 
    118     def __init__(self, interval_hours=0.5):
    119         """Initialize a simple periodic deadline timer.
    120 
    121         @param interval_hours: Interval of the deadline.
    122         """
    123         self.interval_hours = interval_hours
    124         self._reset()
    125 
    126 
    127     def poll(self):
    128         """Poll the timer to see if we've hit the deadline.
    129 
    130         This method resets the deadline if it has passed. If the deadline
    131         hasn't been set, or the current time is less than the deadline, the
    132         method returns False.
    133 
    134         @return: True if the deadline has passed, False otherwise.
    135         """
    136         if not self.deadline or datetime.now() < self.deadline:
    137             return False
    138         self._reset()
    139         return True
    140 
    141 
    142 class JobTimer(object):
    143     """Utility class capable of measuring job timeouts.
    144     """
    145 
    146     # Format used in datetime - string conversion.
    147     time_format = '%m-%d-%Y [%H:%M:%S]'
    148 
    149     def __init__(self, job_created_time, timeout_mins):
    150         """JobTimer constructor.
    151 
    152         @param job_created_time: float representing the time a job was
    153             created. Eg: time.time()
    154         @param timeout_mins: float representing the timeout in minutes.
    155         """
    156         self.job_created_time = datetime.fromtimestamp(job_created_time)
    157         self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0)
    158         self.debug_output_timer = SimpleTimer(interval_hours=0.5)
    159         self.past_halftime = False
    160 
    161 
    162     @classmethod
    163     def format_time(cls, datetime_obj):
    164         """Get the string formatted version of the datetime object.
    165 
    166         @param datetime_obj: A datetime.datetime object.
    167             Eg: datetime.datetime.now()
    168 
    169         @return: A formatted string containing the date/time of the
    170             input datetime.
    171         """
    172         return datetime_obj.strftime(cls.time_format)
    173 
    174 
    175     def elapsed_time(self):
    176         """Get the time elapsed since this job was created.
    177 
    178         @return: A timedelta object representing the elapsed time.
    179         """
    180         return datetime.now() - self.job_created_time
    181 
    182 
    183     def is_suite_timeout(self):
    184         """Check if the suite timed out.
    185 
    186         @return: True if more than timeout_hours has elapsed since the suite job
    187             was created.
    188         """
    189         if self.elapsed_time() >= self.timeout_hours:
    190             logging.info('Suite timed out. Started on %s, timed out on %s',
    191                          self.format_time(self.job_created_time),
    192                          self.format_time(datetime.now()))
    193             return True
    194         return False
    195 
    196 
    197     def first_past_halftime(self):
    198         """Check if we just crossed half time.
    199 
    200         This method will only return True once, the first time it is called
    201         after a job's elapsed time is past half its timeout.
    202 
    203         @return True: If this is the first call of the method after halftime.
    204         """
    205         if (not self.past_halftime and
    206             self.elapsed_time() > self.timeout_hours/2):
    207             self.past_halftime = True
    208             return True
    209         return False
    210 
    211 
    212 class RPCHelper(object):
    213     """A class to help diagnose a suite run through the rpc interface.
    214     """
    215 
    216     def __init__(self, rpc_interface):
    217         """Constructor for rpc helper class.
    218 
    219         @param rpc_interface: An rpc object, eg: A RetryingAFE instance.
    220         """
    221         self.rpc_interface = rpc_interface
    222 
    223 
    224     def diagnose_pool(self, board, pool, time_delta_hours, limit=10):
    225         """Log diagnostic information about a timeout for a board/pool.
    226 
    227         @param board: The board for which the current suite was run.
    228         @param pool: The pool against which the current suite was run.
    229         @param time_delta_hours: The time from which we should log information.
    230             This is a datetime.timedelta object, as stored by the JobTimer.
    231         @param limit: The maximum number of jobs per host, to log.
    232 
    233         @raises proxy.JSONRPCException: For exceptions thrown across the wire.
    234         """
    235         end_time = datetime.now()
    236         start_time = end_time - time_delta_hours
    237         get_histories = status_history.HostJobHistory.get_multiple_histories
    238         host_histories = get_histories(
    239                 self.rpc_interface,
    240                 time_utils.to_epoch_time(start_time),
    241                 time_utils.to_epoch_time(end_time),
    242                 board=board, pool=pool)
    243         if not host_histories:
    244             logging.error('No hosts found for board:%s in pool:%s',
    245                             board, pool)
    246             return
    247         status_map = {
    248             status_history.UNUSED: 'Unused',
    249             status_history.UNKNOWN: 'No job history',
    250             status_history.WORKING: 'Working',
    251             status_history.BROKEN: 'Failed repair'
    252         }
    253         for history in host_histories:
    254             count = 0
    255             job_info =''
    256             for job in history:
    257                 start_time = (
    258                         time_utils.epoch_time_to_date_string(job.start_time))
    259                 job_info += ('%s %s started on: %s status %s\n' %
    260                         (job.id, job.name, start_time, job.job_status))
    261                 count += 1
    262                 if count >= limit:
    263                     break
    264             host = history.host
    265             logging.error('host: %s, status: %s, locked: %s '
    266                           'diagnosis: %s\n'
    267                           'labels: %s\nLast %s jobs within %s:\n'
    268                           '%s',
    269                           history.hostname, host.status, host.locked,
    270                           status_map[history.last_diagnosis()[0]],
    271                           host.labels, limit, time_delta_hours,
    272                           job_info)
    273 
    274 
    275     def _is_host_available(self, host):
    276         """Check whether DUT host is available.
    277 
    278         @param host: The Host instance for the DUT.
    279         @return: bool
    280         """
    281         return not (host.locked or host.status in host_states.UNAVAILABLE_STATES)
    282 
    283 
    284     def check_dut_availability(self, board, pool, minimum_duts=0, skip_duts_check=False):
    285         """Check if DUT availability for a given board and pool is less than
    286         minimum.
    287 
    288         @param board: The board to check DUT availability.
    289         @param pool: The pool to check DUT availability.
    290         @param minimum_duts: Minimum Number of available machines required to
    291                              run the suite. Default is set to 0, which means do
    292                              not force the check of available machines before
    293                              running the suite.
    294         @param skip_duts_check: If True, skip minimum available DUTs check.
    295         @raise: NotEnoughDutsError if DUT availability is lower than minimum.
    296         @raise: BoardNotAvailableError if no host found for requested
    297                 board/pool.
    298         """
    299         if minimum_duts == 0:
    300             return
    301 
    302         # TODO(ayatane): Replace label prefixes with constants in
    303         # site_utils.suite_scheduler.constants
    304         hosts = self.rpc_interface.get_hosts(
    305                 invalid=False,
    306                 multiple_labels=('pool:%s' % pool, 'board:%s' % board))
    307         if not hosts:
    308             raise BoardNotAvailableError(
    309                     'No hosts found for board:%s in pool:%s. The test lab '
    310                     'currently does not cover test for this board and pool.'%
    311                     (board, pool))
    312 
    313         if skip_duts_check:
    314             # Bypass minimum avilable DUTs check
    315             logging.debug('skip_duts_check is on, do not enforce minimum DUTs check.')
    316             return
    317 
    318         if len(hosts) < minimum_duts:
    319             logging.debug('The total number of DUTs for %s in pool:%s is %d, '
    320                           'which is less than %d, the required minimum number of'
    321                           ' available DUTS', board, pool, len(hosts),
    322                           minimum_duts)
    323 
    324         available_hosts = 0
    325         for host in hosts:
    326             if self._is_host_available(host):
    327                 available_hosts += 1
    328         logging.debug('%d of %d DUTs are available for board %s pool %s.',
    329                       available_hosts, len(hosts), board, pool)
    330         if available_hosts < minimum_duts:
    331             raise NotEnoughDutsError(
    332                 board=board,
    333                 pool=pool,
    334                 num_available=available_hosts,
    335                 num_required=minimum_duts,
    336                 hosts=hosts)
    337 
    338 
    339     def diagnose_job(self, job_id, instance_server):
    340         """Diagnose a suite job.
    341 
    342         Logs information about the jobs that are still to run in the suite.
    343 
    344         @param job_id: The id of the suite job to get information about.
    345             No meaningful information gets logged if the id is for a sub-job.
    346         @param instance_server: The instance server.
    347             Eg: cautotest, cautotest-cq, localhost.
    348         """
    349         incomplete_jobs = self.rpc_interface.get_jobs(
    350                 parent_job_id=job_id, summary=True,
    351                 hostqueueentry__complete=False)
    352         if incomplete_jobs:
    353             logging.info('\n%s printing summary of incomplete jobs (%s):\n',
    354                          JobTimer.format_time(datetime.now()),
    355                          len(incomplete_jobs))
    356             for job in incomplete_jobs:
    357                 logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:],
    358                              reporting_utils.link_job(job.id, instance_server))
    359         else:
    360             logging.info('All jobs in suite have already completed.')
    361