Home | History | Annotate | Download | only in site_utils
      1 #!/usr/bin/python
      2 #
      3 # Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
      4 # Use of this source code is governed by a BSD-style license that can be
      5 # found in the LICENSE file.
      6 
      7 import datetime as datetime_base
      8 import logging
      9 from datetime import datetime
     10 
     11 import common
     12 
     13 from autotest_lib.client.common_lib import global_config
     14 from autotest_lib.client.common_lib import time_utils
     15 from autotest_lib.server import utils
     16 from autotest_lib.server.cros.dynamic_suite import reporting_utils
     17 from autotest_lib.server.lib import status_history
     18 
     19 CONFIG = global_config.global_config
     20 
     21 
     22 class DUTsNotAvailableError(utils.TestLabException):
     23     """Raised when a DUT label combination is not available in the lab."""
     24 
     25 
     26 class NotEnoughDutsError(utils.TestLabException):
     27     """Rasied when the lab doesn't have the minimum number of duts."""
     28 
     29     def __init__(self, labels, num_available, num_required, hosts):
     30         """Initialize instance.
     31 
     32         Please pass arguments by keyword.
     33 
     34         @param labels: Labels required, including board an pool labels.
     35         @param num_available: Number of available hosts.
     36         @param num_required: Number of hosts required.
     37         @param hosts: Sequence of Host instances for given board and pool.
     38         """
     39         self.labels = labels
     40         self.num_available = num_available
     41         self.num_required = num_required
     42         self.hosts = hosts
     43         self.bug_id = None
     44         self.suite_name = None
     45         self.build = None
     46 
     47 
     48     def __repr__(self):
     49         return (
     50             '<{cls} at 0x{id:x} with'
     51             ' labels={this.labels!r},'
     52             ' num_available={this.num_available!r},'
     53             ' num_required={this.num_required!r},'
     54             ' bug_id={this.bug_id!r},'
     55             ' suite_name={this.suite_name!r},'
     56             ' build={this.build!r}>'
     57             .format(cls=type(self).__name__, id=id(self), this=self)
     58         )
     59 
     60 
     61     def __str__(self):
     62         msg_parts = [
     63             'Not enough DUTs for requirements: {this.labels};'
     64             ' required: {this.num_required}, found: {this.num_available}'
     65         ]
     66         format_dict = {'this': self}
     67         if self.bug_id is not None:
     68             msg_parts.append('bug: {bug_url}')
     69             format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id)
     70         if self.suite_name is not None:
     71             msg_parts.append('suite: {this.suite_name}')
     72         if self.build is not None:
     73             msg_parts.append('build: {this.build}')
     74         return ', '.join(msg_parts).format(**format_dict)
     75 
     76 
     77     def add_bug_id(self, bug_id):
     78         """Add crbug id associated with this exception.
     79 
     80         @param bug_id  crbug id whose str() value is used in a crbug URL.
     81         """
     82         self.bug_id = bug_id
     83 
     84 
     85     def add_suite_name(self, suite_name):
     86         """Add name of test suite that needed the DUTs.
     87 
     88         @param suite_name  Name of test suite.
     89         """
     90         self.suite_name = suite_name
     91 
     92 
     93     def add_build(self, build):
     94         """Add name of build of job that needed the DUTs.
     95 
     96         @param build  Name of build.
     97         """
     98         self.build = build
     99 
    100 
    101 class SimpleTimer(object):
    102     """A simple timer used to periodically check if a deadline has passed."""
    103 
    104     def _reset(self):
    105         """Reset the deadline."""
    106         if not self.interval_hours or self.interval_hours < 0:
    107             logging.error('Bad interval %s', self.interval_hours)
    108             self.deadline = None
    109             return
    110         self.deadline = datetime.now() + datetime_base.timedelta(
    111                 hours=self.interval_hours)
    112 
    113 
    114     def __init__(self, interval_hours=0.5):
    115         """Initialize a simple periodic deadline timer.
    116 
    117         @param interval_hours: Interval of the deadline.
    118         """
    119         self.interval_hours = interval_hours
    120         self._reset()
    121 
    122 
    123     def poll(self):
    124         """Poll the timer to see if we've hit the deadline.
    125 
    126         This method resets the deadline if it has passed. If the deadline
    127         hasn't been set, or the current time is less than the deadline, the
    128         method returns False.
    129 
    130         @return: True if the deadline has passed, False otherwise.
    131         """
    132         if not self.deadline or datetime.now() < self.deadline:
    133             return False
    134         self._reset()
    135         return True
    136 
    137 
    138 class JobTimer(object):
    139     """Utility class capable of measuring job timeouts.
    140     """
    141 
    142     # Format used in datetime - string conversion.
    143     time_format = '%m-%d-%Y [%H:%M:%S]'
    144 
    145     def __init__(self, job_created_time, timeout_mins):
    146         """JobTimer constructor.
    147 
    148         @param job_created_time: float representing the time a job was
    149             created. Eg: time.time()
    150         @param timeout_mins: float representing the timeout in minutes.
    151         """
    152         self.job_created_time = datetime.fromtimestamp(job_created_time)
    153         self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0)
    154         self.debug_output_timer = SimpleTimer(interval_hours=0.5)
    155         self.past_halftime = False
    156 
    157 
    158     @classmethod
    159     def format_time(cls, datetime_obj):
    160         """Get the string formatted version of the datetime object.
    161 
    162         @param datetime_obj: A datetime.datetime object.
    163             Eg: datetime.datetime.now()
    164 
    165         @return: A formatted string containing the date/time of the
    166             input datetime.
    167         """
    168         return datetime_obj.strftime(cls.time_format)
    169 
    170 
    171     def elapsed_time(self):
    172         """Get the time elapsed since this job was created.
    173 
    174         @return: A timedelta object representing the elapsed time.
    175         """
    176         return datetime.now() - self.job_created_time
    177 
    178 
    179     def is_suite_timeout(self):
    180         """Check if the suite timed out.
    181 
    182         @return: True if more than timeout_hours has elapsed since the suite job
    183             was created.
    184         """
    185         if self.elapsed_time() >= self.timeout_hours:
    186             logging.info('Suite timed out. Started on %s, timed out on %s',
    187                          self.format_time(self.job_created_time),
    188                          self.format_time(datetime.now()))
    189             return True
    190         return False
    191 
    192 
    193     def first_past_halftime(self):
    194         """Check if we just crossed half time.
    195 
    196         This method will only return True once, the first time it is called
    197         after a job's elapsed time is past half its timeout.
    198 
    199         @return True: If this is the first call of the method after halftime.
    200         """
    201         if (not self.past_halftime and
    202             self.elapsed_time() > self.timeout_hours/2):
    203             self.past_halftime = True
    204             return True
    205         return False
    206 
    207 
    208 class RPCHelper(object):
    209     """A class to help diagnose a suite run through the rpc interface.
    210     """
    211 
    212     def __init__(self, rpc_interface):
    213         """Constructor for rpc helper class.
    214 
    215         @param rpc_interface: An rpc object, eg: A RetryingAFE instance.
    216         """
    217         self.rpc_interface = rpc_interface
    218 
    219 
    220     def diagnose_pool(self, labels, time_delta_hours, limit=10):
    221         """Log diagnostic information about a timeout for a board/pool.
    222 
    223         @param labels: DUT label dependencies, including board and pool
    224                        labels.
    225         @param time_delta_hours: The time from which we should log information.
    226             This is a datetime.timedelta object, as stored by the JobTimer.
    227         @param limit: The maximum number of jobs per host, to log.
    228 
    229         @raises proxy.JSONRPCException: For exceptions thrown across the wire.
    230         """
    231         end_time = datetime.now()
    232         start_time = end_time - time_delta_hours
    233         host_histories = status_history.HostJobHistory.get_multiple_histories(
    234                 self.rpc_interface,
    235                 time_utils.to_epoch_time(start_time),
    236                 time_utils.to_epoch_time(end_time),
    237                 labels,
    238         )
    239         if not host_histories:
    240             logging.error('No hosts found for labels %r', labels)
    241             return
    242         status_map = {
    243             status_history.UNUSED: 'Unused',
    244             status_history.UNKNOWN: 'No job history',
    245             status_history.WORKING: 'Working',
    246             status_history.BROKEN: 'Failed repair'
    247         }
    248         for history in host_histories:
    249             count = 0
    250             job_info =''
    251             for job in history:
    252                 start_time = (
    253                         time_utils.epoch_time_to_date_string(job.start_time))
    254                 job_info += ('%s %s started on: %s status %s\n' %
    255                         (job.id, job.name, start_time, job.job_status))
    256                 count += 1
    257                 if count >= limit:
    258                     break
    259             host = history.host
    260             logging.error('host: %s, status: %s, locked: %s '
    261                           'diagnosis: %s\n'
    262                           'labels: %s\nLast %s jobs within %s:\n'
    263                           '%s',
    264                           history.hostname, host.status, host.locked,
    265                           status_map[history.last_diagnosis()[0]],
    266                           host.labels, limit, time_delta_hours,
    267                           job_info)
    268 
    269 
    270     def check_dut_availability(self, labels, minimum_duts=0,
    271                                skip_duts_check=False):
    272         """Check if DUT availability for a given board and pool is less than
    273         minimum.
    274 
    275         @param labels: DUT label dependencies, including board and pool
    276                        labels.
    277         @param minimum_duts: Minimum Number of available machines required to
    278                              run the suite. Default is set to 0, which means do
    279                              not force the check of available machines before
    280                              running the suite.
    281         @param skip_duts_check: If True, skip minimum available DUTs check.
    282         @raise: NotEnoughDutsError if DUT availability is lower than minimum.
    283         @raise: DUTsNotAvailableError if no host found for requested
    284                 board/pool.
    285         """
    286         if minimum_duts == 0:
    287             return
    288 
    289         hosts = self.rpc_interface.get_hosts(
    290                 invalid=False, multiple_labels=labels)
    291         if not hosts:
    292             raise DUTsNotAvailableError(
    293                     'No hosts found for labels %r. The test lab '
    294                     'currently does not cover test for those DUTs.' %
    295                     (labels,))
    296 
    297         if skip_duts_check:
    298             # Bypass minimum avilable DUTs check
    299             logging.debug('skip_duts_check is on, do not enforce minimum '
    300                           'DUTs check.')
    301             return
    302 
    303         if len(hosts) < minimum_duts:
    304             logging.debug('The total number of DUTs for %r is %d, '
    305                           'which is less than %d, the required minimum '
    306                           'number of available DUTS', labels, len(hosts),
    307                           minimum_duts)
    308 
    309         available_hosts = 0
    310         for host in hosts:
    311             if host.is_available():
    312                 available_hosts += 1
    313         logging.debug('%d of %d DUTs are available for %r.',
    314                       available_hosts, len(hosts), labels)
    315         if available_hosts < minimum_duts:
    316             raise NotEnoughDutsError(
    317                 labels=labels,
    318                 num_available=available_hosts,
    319                 num_required=minimum_duts,
    320                 hosts=hosts)
    321 
    322 
    323     def diagnose_job(self, job_id, instance_server):
    324         """Diagnose a suite job.
    325 
    326         Logs information about the jobs that are still to run in the suite.
    327 
    328         @param job_id: The id of the suite job to get information about.
    329             No meaningful information gets logged if the id is for a sub-job.
    330         @param instance_server: The instance server.
    331             Eg: cautotest, cautotest-cq, localhost.
    332         """
    333         incomplete_jobs = self.rpc_interface.get_jobs(
    334                 parent_job_id=job_id, summary=True,
    335                 hostqueueentry__complete=False)
    336         if incomplete_jobs:
    337             logging.info('\n%s printing summary of incomplete jobs (%s):\n',
    338                          JobTimer.format_time(datetime.now()),
    339                          len(incomplete_jobs))
    340             for job in incomplete_jobs:
    341                 logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:],
    342                              reporting_utils.link_job(job.id, instance_server))
    343         else:
    344             logging.info('All jobs in suite have already completed.')
    345