# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Services relating to DUT status and job history.

The central abstraction of this module is the `HostJobHistory`
class.  This class provides two related pieces of information
regarding a single DUT:
  * A history of tests and special tasks that have run on
    the DUT in a given time range.
  * Whether the DUT was "working" or "broken" at a given
    time.

The "working" or "broken" status of a DUT is determined by
the DUT's special task history.  At the end of any job or
task, the status is indicated as follows:
  * After any successful special task, the DUT is considered
    "working".
  * After any failed Repair task, the DUT is considered "broken".
  * After any other special task or after any regular test job, the
    DUT's status is considered unchanged.

Definitions for terms used in the code below:
  * status task - Any special task that determines the DUT's
    status; that is, any successful task, or any failed Repair.
  * diagnosis interval - A time interval during which DUT status
    changed either from "working" to "broken", or vice versa.  The
    interval starts with the last status task with the old status,
    and ends after the first status task with the new status.

Diagnosis intervals are interesting because they normally contain
the logs explaining a failure or repair event.

"""

import common
from autotest_lib.frontend import setup_django_environment
from django.db import models as django_models

from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import time_utils
from autotest_lib.frontend.afe import models as afe_models
from autotest_lib.site_utils.suite_scheduler import constants


# Values used to describe the diagnosis of a DUT.  These values are
# used to indicate both DUT status after a job or task, and also
# diagnosis of whether the DUT was working at the end of a given
# time interval.
#
# UNUSED:  Used when there are no events recorded in a given
#     time interval.
# UNKNOWN:  For an individual event, indicates that the DUT status
#     is unchanged from the previous event.  For a time interval,
#     indicates that the DUT's status can't be determined from the
#     DUT's history.
# WORKING:  Indicates that the DUT was working normally after the
#     event, or at the end of the time interval.
# BROKEN:  Indicates that the DUT needed manual repair after the
#     event, or at the end of the time interval.
#
UNUSED = 0
UNKNOWN = 1
WORKING = 2
BROKEN = 3
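# For illustration only: a hypothetical reporting helper might map these
# values to short display strings before printing a DUT's status.  The
# dictionary below is an assumption for the example, not part of this
# module's API.
#
#     _DIAGNOSIS_NAMES = {
#         UNUSED: 'unused',
#         UNKNOWN: 'unknown',
#         WORKING: 'working',
#         BROKEN: 'broken',
#     }
#     # e.g. _DIAGNOSIS_NAMES[history.last_diagnosis()[0]] -> 'working'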


def parse_time(time_string):
    """Parse time according to a canonical form.

    The "canonical" form is the form in which date/time
    values are stored in the database.

    @param time_string Time to be parsed.
    """
    return int(time_utils.to_epoch_time(time_string))

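# For reference, `parse_time()` expects the database's canonical
# date/time string form; an illustrative (not executed) call:
#
#     parse_time('2015-06-26 11:38:28')
#
# The integer returned is seconds since the epoch; the exact value
# depends on how `time_utils.to_epoch_time()` handles time zones, so no
# specific result is assumed here.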

class _JobEvent(object):
    """Information about an event in host history.

    This remembers the relevant data from a single event in host
    history.  An event is any change in DUT state caused by a job
    or special task.  The data captured are the start and end times
    of the event, the URL of the logs for the job or task causing the
    event, and a diagnosis of whether the DUT was working or failed
    afterwards.

    This class is an adapter around the database model objects
    describing jobs and special tasks.  This is an abstract
    superclass, with concrete subclasses for `HostQueueEntry` and
    `SpecialTask` objects.

    @property start_time  Time the job or task began execution.
    @property end_time    Time the job or task finished execution.
    @property job_url     URL to the logs for the event's job.
    @property diagnosis   Working status of the DUT after the event.

    """

    get_config_value = global_config.global_config.get_config_value
    _LOG_URL_PATTERN = get_config_value('CROS', 'log_url_pattern')

    @classmethod
    def get_log_url(cls, afe_hostname, logdir):
        """Return a URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param afe_hostname Hostname for autotest frontend
        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return cls._LOG_URL_PATTERN % (afe_hostname, logdir)

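    # An illustrative sketch (not executed): if the configured
    # 'CROS.log_url_pattern' value were, say,
    # 'http://%s/tko/retrieve_logs.cgi?job=/results/%s', then
    #
    #     _JobEvent.get_log_url('cautotest', 'hosts/host1/123-repair')
    #
    # would yield a results URL on the 'cautotest' frontend.  Both the
    # pattern and the hostname here are assumptions for the example.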

    def __init__(self, start_time, end_time):
        self.start_time = parse_time(start_time)
        self.end_time = parse_time(end_time)


    def __cmp__(self, other):
        """Compare two jobs by their start time.

        This is a standard Python `__cmp__` method to allow sorting
        `_JobEvent` objects by their times.

        @param other The `_JobEvent` object to compare to `self`.

        """
        return self.start_time - other.start_time

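    # Because `__cmp__` orders events by `start_time`, a mixed list of
    # special-task and test-job events sorts chronologically; this is
    # the idiom `HostJobHistory._get_history()` below relies on:
    #
    #     events = tasks + hqes        # both are lists of _JobEvent
    #     events.sort(reverse=True)    # newest event first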

    @property
    def job_url(self):
        """Return the URL for this event's job logs."""
        raise NotImplementedError()


    @property
    def diagnosis(self):
        """Return the status of the DUT after this event.

        The diagnosis is interpreted as follows:
          UNKNOWN - The DUT status was the same before and after
              the event.
          WORKING - The DUT appeared to be working after the event.
          BROKEN - The DUT likely required manual intervention
              after the event.

        @return A valid diagnosis value.

        """
        raise NotImplementedError()

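# A minimal sketch of what a concrete `_JobEvent` subclass must provide.
# `_FakeEvent` is hypothetical and exists only to show the abstract
# interface; the real adapters are `_SpecialTaskEvent` and
# `_TestJobEvent` below.
#
#     class _FakeEvent(_JobEvent):
#         def __init__(self, afe_hostname, start_time, end_time):
#             self._afe_hostname = afe_hostname
#             super(_FakeEvent, self).__init__(start_time, end_time)
#
#         @property
#         def job_url(self):
#             return _FakeEvent.get_log_url(self._afe_hostname, 'fake/logdir')
#
#         @property
#         def diagnosis(self):
#             return UNKNOWN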

class _SpecialTaskEvent(_JobEvent):
    """`_JobEvent` adapter for special tasks.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_special_tasks` table.

    """

    @classmethod
    def get_tasks(cls, afe, host_id, start_time, end_time):
        """Return special tasks for a host in a given time range.

        Return a list of `_SpecialTaskEvent` objects representing all
        special tasks that ran on the given host in the given time
        range.  The list is ordered as it was returned by the query
        (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_SpecialTaskEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        tasks = afe.get_host_special_tasks(
                host_id,
                time_started__gte=query_start,
                time_finished__lte=query_end,
                is_complete=1)
        return [cls(afe.server, t) for t in tasks]


    @classmethod
    def get_status_task(cls, afe, host_id, end_time):
        """Return the task indicating a host's status at a given time.

        The task returned determines the status of the DUT; the
        diagnosis on the task indicates the diagnosis for the DUT at
        the given `end_time`.

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param end_time    Find status as of this time.

        @return A `_SpecialTaskEvent` object for the requested task,
                or `None` if no task was found.

        """
        query_end = time_utils.epoch_time_to_date_string(end_time)
        task = afe.get_host_status_task(host_id, query_end)
        return cls(afe.server, task) if task else None


    def __init__(self, afe_hostname, afetask):
        self._afe_hostname = afe_hostname
        self._afetask = afetask
        super(_SpecialTaskEvent, self).__init__(
                afetask.time_started, afetask.time_finished)


    @property
    def job_url(self):
        logdir = ('hosts/%s/%s-%s' %
                  (self._afetask.host.hostname, self._afetask.id,
                   self._afetask.task.lower()))
        return _SpecialTaskEvent.get_log_url(self._afe_hostname, logdir)


    @property
    def diagnosis(self):
        if self._afetask.success:
            return WORKING
        elif self._afetask.task == 'Repair':
            return BROKEN
        else:
            return UNKNOWN

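# A brief sketch (not executed) of how the query helpers above are
# typically driven.  `afe` stands for an RPC proxy providing the
# `get_host_special_tasks()` and `get_host_status_task()` calls used
# above; the host id and epoch times are made up for the example:
#
#     tasks = _SpecialTaskEvent.get_tasks(afe, host_id=42,
#                                         start_time=start, end_time=end)
#     status = _SpecialTaskEvent.get_status_task(afe, host_id=42, end_time=end)
#     if status is not None and status.diagnosis == BROKEN:
#         print status.job_url   # logs of the failed Repair task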

class _TestJobEvent(_JobEvent):
    """`_JobEvent` adapter for regular test jobs.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_host_queue_entries` table.

    """

    @classmethod
    def get_hqes(cls, afe, host_id, start_time, end_time):
        """Return HQEs for a host in a given time range.

        Return a list of `_TestJobEvent` objects representing all the
        HQEs of all the jobs that ran on the given host in the given
        time range.  The list is ordered as it was returned by the
        query (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_TestJobEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        hqelist = afe.get_host_queue_entries(
                host_id=host_id,
                start_time=query_start,
                end_time=query_end,
                complete=1)
        return [cls(afe.server, hqe) for hqe in hqelist]


    def __init__(self, afe_hostname, hqe):
        self._afe_hostname = afe_hostname
        self._hqe = hqe
        super(_TestJobEvent, self).__init__(
                hqe.started_on, hqe.finished_on)


    @property
    def job_url(self):
        logdir = '%s-%s' % (self._hqe.job.id, self._hqe.job.owner)
        return _TestJobEvent.get_log_url(self._afe_hostname, logdir)


    @property
    def diagnosis(self):
        return UNKNOWN


class HostJobHistory(object):
    """Class to query and remember DUT execution and status history.

    This class is responsible for querying the database to determine
    the history of a single DUT in a time interval of interest, and
    for remembering the query results for reporting.

    @property hostname    Host name of the DUT.
    @property start_time  Start of the requested time interval.
                          This field may be `None`.
    @property end_time    End of the requested time interval.
    @property _afe        Autotest frontend for queries.
    @property _host       Database host object for the DUT.
    @property _history    A list of jobs and special tasks that
                          ran on the DUT in the requested time
                          interval, ordered in reverse, from latest
                          to earliest.

    @property _status_interval   A list of all the jobs and special
                                 tasks that ran on the DUT in the
                                 last diagnosis interval prior to
                                 `end_time`, ordered from latest to
                                 earliest.
    @property _status_diagnosis  The DUT's status as of `end_time`.
    @property _status_task       The DUT's last status task as of
                                 `end_time`.

    """

    @classmethod
    def get_host_history(cls, afe, hostname, start_time, end_time):
        """Create a `HostJobHistory` instance for a single host.

        Simple factory method to construct host history from a
        hostname.  This looks up the host in the AFE database and
        passes it to the class constructor.

        @param afe         Autotest frontend
        @param hostname    Name of the host.
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.

        @return A new `HostJobHistory` instance.

        """
        afehost = afe.get_hosts(hostname=hostname)[0]
        return cls(afe, afehost, start_time, end_time)

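    # An illustrative sketch (not executed) of the factory in use; the
    # frontend server name and DUT hostname are made up:
    #
    #     import time
    #     from autotest_lib.server import frontend
    #     afe = frontend.AFE(server='cautotest')
    #     end = int(time.time())
    #     history = HostJobHistory.get_host_history(
    #             afe, 'chromeos1-row1-rack1-host1', end - 24 * 3600, end)
    #     for event in history:
    #         print event.start_time, event.diagnosis, event.job_url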

    @classmethod
    def get_multiple_histories(cls, afe, start_time, end_time,
                               board=None, pool=None):
        """Create `HostJobHistory` instances for a set of hosts.

        The set of hosts can be specified as "all hosts of a given
        board type", "all hosts in a given pool", or "all hosts
        of a given board and pool".

        @param afe         Autotest frontend
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.
        @param board       All hosts must have this board type; if
                           `None`, all boards are allowed.
        @param pool        All hosts must be in this pool; if
                           `None`, all pools are allowed.

        @return A list of new `HostJobHistory` instances.

        """
        # If `board` and `pool` are both `None`, we could search the
        # entire database, which is more expensive than we want.
        # Our caller currently won't (can't) do this, but assert to
        # be safe.
        assert board is not None or pool is not None
        labels = []
        if board is not None:
            labels.append(constants.Labels.BOARD_PREFIX + board)
        if pool is not None:
            labels.append(constants.Labels.POOL_PREFIX + pool)
        kwargs = {'multiple_labels': labels}
        hosts = afe.get_hosts(**kwargs)
        return [cls(afe, h, start_time, end_time) for h in hosts]

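    # An illustrative sketch (not executed); the board and pool names are
    # made up, and `afe` is an RPC proxy as in the example above:
    #
    #     histories = HostJobHistory.get_multiple_histories(
    #             afe, start_time, end_time, board='lumpy', pool='bvt')
    #     for history in histories:
    #         print history.hostname, history.last_diagnosis()[0]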

    def __init__(self, afe, afehost, start_time, end_time):
        self._afe = afe
        self.hostname = afehost.hostname
        self.end_time = end_time
        self.start_time = start_time
        self._host = afehost
        # Don't spend time on queries until they're needed.
        self._history = None
        self._status_interval = None
        self._status_diagnosis = None
        self._status_task = None


    def _get_history(self, start_time, end_time):
        """Get the list of events for the given interval."""
        newtasks = _SpecialTaskEvent.get_tasks(
                self._afe, self._host.id, start_time, end_time)
        newhqes = _TestJobEvent.get_hqes(
                self._afe, self._host.id, start_time, end_time)
        newhistory = newtasks + newhqes
        newhistory.sort(reverse=True)
        return newhistory


    def __iter__(self):
        if self._history is None:
            self._history = self._get_history(self.start_time,
                                              self.end_time)
        return self._history.__iter__()


    def _extract_prefixed_label(self, prefix):
        labels = [l for l in self._host.labels
                    if l.startswith(prefix)]
        return labels[0][len(prefix):] if labels else None

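    # For illustration: if the board label prefix were 'board:' (the
    # actual prefixes come from `constants.Labels`), then for a host
    # labelled 'board:lumpy', `self._extract_prefixed_label('board:')`
    # would return 'lumpy', and `None` if no label matched.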

    @property
    def host(self):
        """Return the AFE host object for this history."""
        return self._host


    @property
    def host_board(self):
        """Return the board name for this history's DUT."""
        prefix = constants.Labels.BOARD_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_pool(self):
        """Return the pool name for this history's DUT."""
        prefix = constants.Labels.POOL_PREFIX
        return self._extract_prefixed_label(prefix)


    def _init_status_task(self):
        """Fill in `self._status_diagnosis` and `_status_task`."""
        if self._status_diagnosis is not None:
            return
        self._status_task = _SpecialTaskEvent.get_status_task(
                self._afe, self._host.id, self.end_time)
        if self._status_task is not None:
            self._status_diagnosis = self._status_task.diagnosis
        else:
            self._status_diagnosis = UNKNOWN


    def _init_status_interval(self):
        """Fill in `self._status_interval`."""
        if self._status_interval is not None:
            return
        self._init_status_task()
        self._status_interval = []
        if self._status_task is None:
            return
        query_end = time_utils.epoch_time_to_date_string(self.end_time)
        interval = self._afe.get_host_diagnosis_interval(
                self._host.id, query_end,
                self._status_diagnosis != WORKING)
        if not interval:
            return
        self._status_interval = self._get_history(
                parse_time(interval[0]),
                parse_time(interval[1]))


    def diagnosis_interval(self):
        """Find this history's most recent diagnosis interval.

        Returns a list of `_JobEvent` instances corresponding to the
        most recent diagnosis interval occurring before this
        history's end time.

        The list is returned as with `self._history`, ordered from
        most to least recent.

        @return The list of the `_JobEvent`s in the diagnosis
                interval.

        """
        self._init_status_interval()
        return self._status_interval


    def last_diagnosis(self):
        """Return the diagnosis of whether the DUT is working.

        This searches the DUT's job history, looking for the most
        recent status task for the DUT.  Return a tuple of
        `(diagnosis, task)`.

        The `diagnosis` entry in the tuple is one of these values:
          * UNUSED - The host's last status task is older than
              `self.start_time`.
          * WORKING - The DUT is working.
          * BROKEN - The DUT likely requires manual intervention.
          * UNKNOWN - No task could be found indicating status for
              the DUT.

        If the DUT was working at last check, but hasn't been used
        inside this history's time interval, the status `UNUSED` is
        returned with the last status task, instead of `WORKING`.

        The `task` entry in the tuple is the status task that led to
        the diagnosis.  The task will be `None` if the diagnosis is
        `UNKNOWN`.

        @return A tuple with the DUT's diagnosis and the task that
                determined it.

        """
        self._init_status_task()
        diagnosis = self._status_diagnosis
        if (self.start_time is not None and
                self._status_task is not None and
                self._status_task.end_time < self.start_time and
                diagnosis == WORKING):
            diagnosis = UNUSED
        return diagnosis, self._status_task

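    # An illustrative sketch (not executed) of consuming the tuple
    # returned by `last_diagnosis()`:
    #
    #     diagnosis, task = history.last_diagnosis()
    #     if diagnosis == BROKEN:
    #         # `task` is the failed Repair task that marked the DUT
    #         # broken; its diagnosis interval should hold the relevant logs.
    #         print task.job_url
    #         for event in history.diagnosis_interval():
    #             print event.start_time, event.job_url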

def get_diagnosis_interval(host_id, end_time, success):
    """Return the last diagnosis interval for a given host and time.

    This routine queries the database for the special tasks on a
    given host before a given time.  From those tasks it selects the
    last status task before a change in status, and the first status
    task after the change.  When `success` is true, the change must
    be from "working" to "broken".  When false, the search is for a
    change in the opposite direction.

    A "successful status task" is any successful special task.  A
    "failed status task" is a failed Repair task.  These criteria
    are based on the definition of "status task" in the module-level
    docstring, above.

    This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`.

    @param host_id     Database host id of the desired host.
    @param end_time    Find the last eligible interval before this time.
    @param success     Whether the eligible interval should start with a
                       success or a failure.

    @return A list containing the start time of the last status task
            with the old status, and the end time of the first status
            task with the new status.

    """
    base_query = afe_models.SpecialTask.objects.filter(
            host_id=host_id, is_complete=True)
    success_query = base_query.filter(success=True)
    failure_query = base_query.filter(success=False, task='Repair')
    if success:
        query0 = success_query
        query1 = failure_query
    else:
        query0 = failure_query
        query1 = success_query
    query0 = query0.filter(time_finished__lte=end_time)
    query0 = query0.order_by('time_started').reverse()
    if not query0:
        return []
    task0 = query0[0]
    query1 = query1.filter(time_finished__gt=task0.time_finished)
    task1 = query1.order_by('time_started')[0]
    return [task0.time_started.strftime(time_utils.TIME_FMT),
            task1.time_finished.strftime(time_utils.TIME_FMT)]

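# Client code normally reaches `get_diagnosis_interval()` through the AFE
# RPC proxy rather than calling it directly.  A brief sketch, using the
# same date-string conversion as `HostJobHistory._init_status_interval()`
# above; the host id and the epoch `end_time` value are assumed for the
# example:
#
#     query_end = time_utils.epoch_time_to_date_string(end_time)
#     interval = afe.get_host_diagnosis_interval(42, query_end, True)
#     if interval:
#         start, end = parse_time(interval[0]), parse_time(interval[1])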

def get_status_task(host_id, end_time):
    """Get the last status task for a host before a given time.

    This routine returns a Django query for the AFE database to find
    the last task that finished on the given host before the given
    time that was either a successful task, or a Repair task.  The
    query criteria are based on the definition of "status task" in
    the module-level docstring, above.

    This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`.

    @param host_id     Database host id of the desired host.
    @param end_time    End time of the range of interest.

    @return A Django query-set selecting the single special task of
            interest.

    """
    # Selects status tasks:  any Repair task, or any successful task.
    status_tasks = (django_models.Q(task='Repair') |
                    django_models.Q(success=True))
    # Our caller needs a Django query set in order to serialize the
    # result, so we don't resolve the query here; we just return a
    # slice with at most one element.
    return afe_models.SpecialTask.objects.filter(
            status_tasks,
            host_id=host_id,
            time_finished__lte=end_time,
            is_complete=True).order_by('time_started').reverse()[0:1]
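# The query-set returned above is sliced but not resolved.  An
# illustrative sketch of how a caller might resolve it (the RPC layer
# does something similar when serializing the result); the host id and
# timestamp are made up:
#
#     tasks = get_status_task(host_id=42, end_time='2015-06-26 11:38:28')
#     status_task = tasks[0] if tasks else None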
    601