Home | History | Annotate | Download | only in site_utils
      1 #!/usr/bin/python
      2 #
      3 # Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
      4 # Use of this source code is governed by a BSD-style license that can be
      5 # found in the LICENSE file.
      6 
      7 import datetime as datetime_base
      8 import logging
      9 from datetime import datetime
     10 
     11 import common
     12 
     13 from autotest_lib.client.common_lib import global_config
     14 from autotest_lib.client.common_lib import time_utils
     15 from autotest_lib.server import utils
     16 from autotest_lib.server.cros.dynamic_suite import reporting_utils
     17 from autotest_lib.server.lib import status_history
     18 from autotest_lib.utils import labellib
     19 
     20 CONFIG = global_config.global_config
     21 
     22 
     23 class BoardNotAvailableError(utils.TestLabException):
     24     """Raised when a board is not available in the lab."""
     25 
     26 
     27 class NotEnoughDutsError(utils.TestLabException):
     28     """Rasied when the lab doesn't have the minimum number of duts."""
     29 
     30     def __init__(self, board, pool, num_available, num_required, hosts):
     31         """Initialize instance.
     32 
     33         Please pass arguments by keyword.
     34 
     35         @param board: Name of board.
     36         @param pool: Name of pool.
     37         @param num_available: Number of available hosts.
     38         @param num_required: Number of hosts required.
     39         @param hosts: Sequence of Host instances for given board and pool.
     40         """
     41         self.board = board
     42         self.pool = pool
     43         self.num_available = num_available
     44         self.num_required = num_required
     45         self.hosts = hosts
     46         self.bug_id = None
     47         self.suite_name = None
     48         self.build = None
     49 
     50 
     51     def __repr__(self):
     52         return (
     53             '<{cls} at 0x{id:x} with'
     54             ' board={this.board!r},'
     55             ' pool={this.pool!r},'
     56             ' num_available={this.num_available!r},'
     57             ' num_required={this.num_required!r},'
     58             ' bug_id={this.bug_id!r},'
     59             ' suite_name={this.suite_name!r},'
     60             ' build={this.build!r}>'
     61             .format(cls=type(self).__name__, id=id(self), this=self)
     62         )
     63 
     64 
     65     def __str__(self):
     66         msg_parts = [
     67             'Not enough DUTs for board: {this.board}, pool: {this.pool};'
     68             ' required: {this.num_required}, found: {this.num_available}'
     69         ]
     70         format_dict = {'this': self}
     71         if self.bug_id is not None:
     72             msg_parts.append('bug: {bug_url}')
     73             format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id)
     74         if self.suite_name is not None:
     75             msg_parts.append('suite: {this.suite_name}')
     76         if self.build is not None:
     77             msg_parts.append('build: {this.build}')
     78         return ', '.join(msg_parts).format(**format_dict)
     79 
     80 
     81     def add_bug_id(self, bug_id):
     82         """Add crbug id associated with this exception.
     83 
     84         @param bug_id  crbug id whose str() value is used in a crbug URL.
     85         """
     86         self.bug_id = bug_id
     87 
     88 
     89     def add_suite_name(self, suite_name):
     90         """Add name of test suite that needed the DUTs.
     91 
     92         @param suite_name  Name of test suite.
     93         """
     94         self.suite_name = suite_name
     95 
     96 
     97     def add_build(self, build):
     98         """Add name of build of job that needed the DUTs.
     99 
    100         @param build  Name of build.
    101         """
    102         self.build = build
    103 
    104 
    105 class SimpleTimer(object):
    106     """A simple timer used to periodically check if a deadline has passed."""
    107 
    108     def _reset(self):
    109         """Reset the deadline."""
    110         if not self.interval_hours or self.interval_hours < 0:
    111             logging.error('Bad interval %s', self.interval_hours)
    112             self.deadline = None
    113             return
    114         self.deadline = datetime.now() + datetime_base.timedelta(
    115                 hours=self.interval_hours)
    116 
    117 
    118     def __init__(self, interval_hours=0.5):
    119         """Initialize a simple periodic deadline timer.
    120 
    121         @param interval_hours: Interval of the deadline.
    122         """
    123         self.interval_hours = interval_hours
    124         self._reset()
    125 
    126 
    127     def poll(self):
    128         """Poll the timer to see if we've hit the deadline.
    129 
    130         This method resets the deadline if it has passed. If the deadline
    131         hasn't been set, or the current time is less than the deadline, the
    132         method returns False.
    133 
    134         @return: True if the deadline has passed, False otherwise.
    135         """
    136         if not self.deadline or datetime.now() < self.deadline:
    137             return False
    138         self._reset()
    139         return True
    140 
    141 
    142 class JobTimer(object):
    143     """Utility class capable of measuring job timeouts.
    144     """
    145 
    146     # Format used in datetime - string conversion.
    147     time_format = '%m-%d-%Y [%H:%M:%S]'
    148 
    149     def __init__(self, job_created_time, timeout_mins):
    150         """JobTimer constructor.
    151 
    152         @param job_created_time: float representing the time a job was
    153             created. Eg: time.time()
    154         @param timeout_mins: float representing the timeout in minutes.
    155         """
    156         self.job_created_time = datetime.fromtimestamp(job_created_time)
    157         self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0)
    158         self.debug_output_timer = SimpleTimer(interval_hours=0.5)
    159         self.past_halftime = False
    160 
    161 
    162     @classmethod
    163     def format_time(cls, datetime_obj):
    164         """Get the string formatted version of the datetime object.
    165 
    166         @param datetime_obj: A datetime.datetime object.
    167             Eg: datetime.datetime.now()
    168 
    169         @return: A formatted string containing the date/time of the
    170             input datetime.
    171         """
    172         return datetime_obj.strftime(cls.time_format)
    173 
    174 
    175     def elapsed_time(self):
    176         """Get the time elapsed since this job was created.
    177 
    178         @return: A timedelta object representing the elapsed time.
    179         """
    180         return datetime.now() - self.job_created_time
    181 
    182 
    183     def is_suite_timeout(self):
    184         """Check if the suite timed out.
    185 
    186         @return: True if more than timeout_hours has elapsed since the suite job
    187             was created.
    188         """
    189         if self.elapsed_time() >= self.timeout_hours:
    190             logging.info('Suite timed out. Started on %s, timed out on %s',
    191                          self.format_time(self.job_created_time),
    192                          self.format_time(datetime.now()))
    193             return True
    194         return False
    195 
    196 
    197     def first_past_halftime(self):
    198         """Check if we just crossed half time.
    199 
    200         This method will only return True once, the first time it is called
    201         after a job's elapsed time is past half its timeout.
    202 
    203         @return True: If this is the first call of the method after halftime.
    204         """
    205         if (not self.past_halftime and
    206             self.elapsed_time() > self.timeout_hours/2):
    207             self.past_halftime = True
    208             return True
    209         return False
    210 
    211 
    212 class RPCHelper(object):
    213     """A class to help diagnose a suite run through the rpc interface.
    214     """
    215 
    216     def __init__(self, rpc_interface):
    217         """Constructor for rpc helper class.
    218 
    219         @param rpc_interface: An rpc object, eg: A RetryingAFE instance.
    220         """
    221         self.rpc_interface = rpc_interface
    222 
    223 
    224     def diagnose_pool(self, board, pool, time_delta_hours, limit=10):
    225         """Log diagnostic information about a timeout for a board/pool.
    226 
    227         @param board: The board for which the current suite was run.
    228         @param pool: The pool against which the current suite was run.
    229         @param time_delta_hours: The time from which we should log information.
    230             This is a datetime.timedelta object, as stored by the JobTimer.
    231         @param limit: The maximum number of jobs per host, to log.
    232 
    233         @raises proxy.JSONRPCException: For exceptions thrown across the wire.
    234         """
    235         end_time = datetime.now()
    236         start_time = end_time - time_delta_hours
    237         labels = labellib.LabelsMapping()
    238         labels['board'] = board
    239         labels['pool'] = pool
    240         host_histories = status_history.HostJobHistory.get_multiple_histories(
    241                 self.rpc_interface,
    242                 time_utils.to_epoch_time(start_time),
    243                 time_utils.to_epoch_time(end_time),
    244                 labels.getlabels(),
    245         )
    246         if not host_histories:
    247             logging.error('No hosts found for board:%s in pool:%s',
    248                             board, pool)
    249             return
    250         status_map = {
    251             status_history.UNUSED: 'Unused',
    252             status_history.UNKNOWN: 'No job history',
    253             status_history.WORKING: 'Working',
    254             status_history.BROKEN: 'Failed repair'
    255         }
    256         for history in host_histories:
    257             count = 0
    258             job_info =''
    259             for job in history:
    260                 start_time = (
    261                         time_utils.epoch_time_to_date_string(job.start_time))
    262                 job_info += ('%s %s started on: %s status %s\n' %
    263                         (job.id, job.name, start_time, job.job_status))
    264                 count += 1
    265                 if count >= limit:
    266                     break
    267             host = history.host
    268             logging.error('host: %s, status: %s, locked: %s '
    269                           'diagnosis: %s\n'
    270                           'labels: %s\nLast %s jobs within %s:\n'
    271                           '%s',
    272                           history.hostname, host.status, host.locked,
    273                           status_map[history.last_diagnosis()[0]],
    274                           host.labels, limit, time_delta_hours,
    275                           job_info)
    276 
    277 
    278     def check_dut_availability(self, board, pool, minimum_duts=0, skip_duts_check=False):
    279         """Check if DUT availability for a given board and pool is less than
    280         minimum.
    281 
    282         @param board: The board to check DUT availability.
    283         @param pool: The pool to check DUT availability.
    284         @param minimum_duts: Minimum Number of available machines required to
    285                              run the suite. Default is set to 0, which means do
    286                              not force the check of available machines before
    287                              running the suite.
    288         @param skip_duts_check: If True, skip minimum available DUTs check.
    289         @raise: NotEnoughDutsError if DUT availability is lower than minimum.
    290         @raise: BoardNotAvailableError if no host found for requested
    291                 board/pool.
    292         """
    293         if minimum_duts == 0:
    294             return
    295 
    296         # TODO(ayatane): Replace label prefixes with constants in
    297         # autotest_lib.server.constants
    298         hosts = self.rpc_interface.get_hosts(
    299                 invalid=False,
    300                 multiple_labels=('pool:%s' % pool, 'board:%s' % board))
    301         if not hosts:
    302             raise BoardNotAvailableError(
    303                     'No hosts found for board:%s in pool:%s. The test lab '
    304                     'currently does not cover test for this board and pool.'%
    305                     (board, pool))
    306 
    307         if skip_duts_check:
    308             # Bypass minimum avilable DUTs check
    309             logging.debug('skip_duts_check is on, do not enforce minimum DUTs check.')
    310             return
    311 
    312         if len(hosts) < minimum_duts:
    313             logging.debug('The total number of DUTs for %s in pool:%s is %d, '
    314                           'which is less than %d, the required minimum number of'
    315                           ' available DUTS', board, pool, len(hosts),
    316                           minimum_duts)
    317 
    318         available_hosts = 0
    319         for host in hosts:
    320             if host.is_available():
    321                 available_hosts += 1
    322         logging.debug('%d of %d DUTs are available for board %s pool %s.',
    323                       available_hosts, len(hosts), board, pool)
    324         if available_hosts < minimum_duts:
    325             raise NotEnoughDutsError(
    326                 board=board,
    327                 pool=pool,
    328                 num_available=available_hosts,
    329                 num_required=minimum_duts,
    330                 hosts=hosts)
    331 
    332 
    333     def diagnose_job(self, job_id, instance_server):
    334         """Diagnose a suite job.
    335 
    336         Logs information about the jobs that are still to run in the suite.
    337 
    338         @param job_id: The id of the suite job to get information about.
    339             No meaningful information gets logged if the id is for a sub-job.
    340         @param instance_server: The instance server.
    341             Eg: cautotest, cautotest-cq, localhost.
    342         """
    343         incomplete_jobs = self.rpc_interface.get_jobs(
    344                 parent_job_id=job_id, summary=True,
    345                 hostqueueentry__complete=False)
    346         if incomplete_jobs:
    347             logging.info('\n%s printing summary of incomplete jobs (%s):\n',
    348                          JobTimer.format_time(datetime.now()),
    349                          len(incomplete_jobs))
    350             for job in incomplete_jobs:
    351                 logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:],
    352                              reporting_utils.link_job(job.id, instance_server))
    353         else:
    354             logging.info('All jobs in suite have already completed.')
    355