# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Services relating to DUT status and job history.

The central abstraction of this module is the `HostJobHistory`
class.  This class provides two related pieces of information
regarding a single DUT:
  * A history of tests and special tasks that have run on
    the DUT in a given time range.
  * Whether the DUT was "working" or "broken" at a given
    time.

The "working" or "broken" status of a DUT is determined by
the DUT's special task history.  At the end of any job or
task, the status is indicated as follows:
  * After any successful special task, the DUT is considered
    "working".
  * After any failed Repair task, the DUT is considered "broken".
  * After any other special task or after any regular test job, the
    DUT's status is considered unchanged.

Definitions for terms used in the code below:
  * status task - Any special task that determines the DUT's
    status; that is, any successful task, or any failed Repair.
  * diagnosis interval - A time interval during which DUT status
    changed either from "working" to "broken", or vice versa.  The
    interval starts with the last status task with the old status,
    and ends after the first status task with the new status.

Diagnosis intervals are interesting because they normally contain
the logs explaining a failure or repair event.

"""

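# Typical use (an illustrative sketch, not executed here; the AFE
# handle and host name are assumptions for the example only):
#
#     from autotest_lib.server import frontend
#     afe = frontend.AFE()
#     history = HostJobHistory.get_host_history(
#             afe, 'chromeos1-row1-rack1-host1',
#             parse_time('2015-06-01 00:00:00'),
#             parse_time('2015-06-02 00:00:00'))
#     for event in history:
#         print event.job_status, event.name
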
import common
import os
from autotest_lib.frontend import setup_django_environment
from django.db import models as django_models

from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib import time_utils
from autotest_lib.frontend.afe import models as afe_models
from autotest_lib.server import constants


# Values used to describe the diagnosis of a DUT.  These values are
# used to indicate both DUT status after a job or task, and also
# diagnosis of whether the DUT was working at the end of a given
# time interval.
#
# UNUSED:  Used when there are no events recorded in a given
#     time interval.
# UNKNOWN:  For an individual event, indicates that the DUT status
#     is unchanged from the previous event.  For a time interval,
#     indicates that the DUT's status can't be determined from the
#     DUT's history.
# WORKING:  Indicates that the DUT was working normally after the
#     event, or at the end of the time interval.
# BROKEN:  Indicates that the DUT needed manual repair after the
#     event, or at the end of the time interval.
#
UNUSED = 0
UNKNOWN = 1
WORKING = 2
BROKEN = 3


def parse_time(time_string):
    """Parse time according to a canonical form.

    The "canonical" form is the form in which date/time
    values are stored in the database.

    @param time_string Time to be parsed.
    """
    return int(time_utils.to_epoch_time(time_string))


class _JobEvent(object):
    """Information about an event in host history.

    This remembers the relevant data from a single event in host
    history.  An event is any change in DUT state caused by a job
    or special task.  The data captured are the start and end times
    of the event, the URL of the logs for the job or task causing the
    event, and a diagnosis of whether the DUT was working or failed
    afterwards.

    This class is an adapter around the database model objects
    describing jobs and special tasks.  This is an abstract
    superclass, with concrete subclasses for `HostQueueEntry` and
    `SpecialTask` objects.

    @property start_time  Time the job or task began execution.
    @property end_time    Time the job or task finished execution.
    @property id          id of the event in the AFE database.
    @property name        Name of the event, derived from the AFE database.
    @property job_status  Short string describing the event's final status.
    @property logdir      Relative path to the logs for the event's job.
    @property job_url     URL to the logs for the event's job.
    @property gs_url      GS URL to the logs for the event's job.
    @property job_id      id of the AFE job for HQEs.  None otherwise.
    @property diagnosis   Working status of the DUT after the event.
    @property is_special  Boolean indicating if the event is a special task.

    """

    get_config_value = global_config.global_config.get_config_value
    _LOG_URL_PATTERN = ('%s/browse/chromeos-autotest-results/%%s/'
                        % get_config_value('AUTOTEST_WEB', 'stainless_url',
                                           default=None))

    @classmethod
    def get_gs_url(cls, logdir):
        """Return a GS URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return os.path.join(utils.get_offload_gsuri(), logdir)


    def __init__(self, start_time, end_time):
        self.start_time = parse_time(start_time)
        self.end_time = parse_time(end_time)


    def __cmp__(self, other):
        """Compare two jobs by their start time.

        This is a standard Python `__cmp__` method to allow sorting
        `_JobEvent` objects by their times.

        @param other The `_JobEvent` object to compare to `self`.

        """
        return self.start_time - other.start_time


    @property
    def id(self):
        """Return the id of the event in the AFE database."""
        raise NotImplementedError()


    @property
    def name(self):
        """Return the name of the event."""
        raise NotImplementedError()


    @property
    def job_status(self):
        """Return a short string describing the event's final status."""
        raise NotImplementedError()


    @property
    def logdir(self):
        """Return the relative path for this event's job logs."""
        raise NotImplementedError()


    @property
    def job_url(self):
        """Return the URL for this event's job logs."""
        return self._LOG_URL_PATTERN % self.logdir


    @property
    def gs_url(self):
        """Return the GS URL for this event's job logs."""
        return self.get_gs_url(self.logdir)


    @property
    def job_id(self):
        """Return the id of the AFE job for HQEs.  None otherwise."""
        raise NotImplementedError()


    @property
    def diagnosis(self):
        """Return the status of the DUT after this event.

        The diagnosis is interpreted as follows:
          UNKNOWN - The DUT status was the same before and after
              the event.
          WORKING - The DUT appeared to be working after the event.
          BROKEN - The DUT likely required manual intervention
              after the event.

        @return A valid diagnosis value.

        """
        raise NotImplementedError()


    @property
    def is_special(self):
        """Return if the event is for a special task."""
        raise NotImplementedError()


class _SpecialTaskEvent(_JobEvent):
    """`_JobEvent` adapter for special tasks.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_special_tasks` table.

    """

    @classmethod
    def get_tasks(cls, afe, host_id, start_time, end_time):
        """Return special tasks for a host in a given time range.

        Return a list of `_SpecialTaskEvent` objects representing all
        special tasks that ran on the given host in the given time
        range.  The list is ordered as it was returned by the query
        (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_SpecialTaskEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        tasks = afe.get_host_special_tasks(
                host_id,
                time_started__gte=query_start,
                time_finished__lte=query_end,
                is_complete=1)
        return [cls(t) for t in tasks]


    @classmethod
    def get_status_task(cls, afe, host_id, end_time):
        """Return the task indicating a host's status at a given time.

        The task returned determines the status of the DUT; the
        diagnosis on the task indicates the diagnosis for the DUT at
        the given `end_time`.

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param end_time    Find status as of this time.

        @return A `_SpecialTaskEvent` object for the requested task,
                or `None` if no task was found.

        """
        query_end = time_utils.epoch_time_to_date_string(end_time)
        task = afe.get_host_status_task(host_id, query_end)
        return cls(task) if task else None


    def __init__(self, afetask):
        self._afetask = afetask
        super(_SpecialTaskEvent, self).__init__(
                afetask.time_started, afetask.time_finished)


    @property
    def id(self):
        return self._afetask.id


    @property
    def name(self):
        return self._afetask.task


    @property
    def job_status(self):
        if self._afetask.is_aborted:
            return 'ABORTED'
        elif self._afetask.success:
            return 'PASS'
        else:
            return 'FAIL'


    @property
    def logdir(self):
        return ('hosts/%s/%s-%s' %
                (self._afetask.host.hostname, self._afetask.id,
                 self._afetask.task.lower()))

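    # Example (illustrative; the host name and task id are assumed):  a
    # 'Repair' task with id 42 on host 'chromeos1-row1-rack1-host1' has
    #     logdir == 'hosts/chromeos1-row1-rack1-host1/42-repair'
    # and its job_url and gs_url append that path to the stainless
    # browse URL and the GS offload bucket, respectively.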

    @property
    def job_id(self):
        return None


    @property
    def diagnosis(self):
        if self._afetask.success:
            return WORKING
        elif self._afetask.task == 'Repair':
            return BROKEN
        else:
            return UNKNOWN

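    # For example, following the rules above:  a successful 'Verify'
    # task yields WORKING, a failed 'Repair' task yields BROKEN, and any
    # other failed task (e.g. a failed 'Verify') yields UNKNOWN.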

    @property
    def is_special(self):
        return True


class _TestJobEvent(_JobEvent):
    """`_JobEvent` adapter for regular test jobs.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_host_queue_entries` table.

    """

    @classmethod
    def get_hqes(cls, afe, host_id, start_time, end_time):
        """Return HQEs for a host in a given time range.

        Return a list of `_TestJobEvent` objects representing all the
        HQEs of all the jobs that ran on the given host in the given
        time range.  The list is ordered as it was returned by the
        query (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_TestJobEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        hqelist = afe.get_host_queue_entries_by_insert_time(
                host_id=host_id,
                insert_time_after=query_start,
                insert_time_before=query_end,
                started_on__gte=query_start,
                started_on__lte=query_end,
                complete=1)
        return [cls(hqe) for hqe in hqelist]


    def __init__(self, hqe):
        self._hqe = hqe
        super(_TestJobEvent, self).__init__(
                hqe.started_on, hqe.finished_on)


    @property
    def id(self):
        return self._hqe.id


    @property
    def name(self):
        return self._hqe.job.name


    @property
    def job_status(self):
        return self._hqe.status


    @property
    def logdir(self):
        return _get_job_logdir(self._hqe.job)


    @property
    def job_id(self):
        return self._hqe.job.id


    @property
    def diagnosis(self):
        return UNKNOWN


    @property
    def is_special(self):
        return False


class HostJobHistory(object):
    """Class to query and remember DUT execution and status history.

    This class is responsible for querying the database to determine
    the history of a single DUT in a time interval of interest, and
    for remembering the query results for reporting.

    @property hostname    Host name of the DUT.
    @property start_time  Start of the requested time interval, as a unix
                          timestamp (epoch time).
                          This field may be `None`.
    @property end_time    End of the requested time interval, as a unix
                          timestamp (epoch time).
    @property _afe        Autotest frontend for queries.
    @property _host       Database host object for the DUT.
    @property _history    A list of jobs and special tasks that
                          ran on the DUT in the requested time
                          interval, ordered in reverse, from latest
                          to earliest.

    @property _status_interval   A list of all the jobs and special
                                 tasks that ran on the DUT in the
                                 last diagnosis interval prior to
                                 `end_time`, ordered from latest to
                                 earliest.
    @property _status_diagnosis  The DUT's status as of `end_time`.
    @property _status_task       The DUT's last status task as of
                                 `end_time`.

    """

    @classmethod
    def get_host_history(cls, afe, hostname, start_time, end_time):
        """Create a `HostJobHistory` instance for a single host.

        Simple factory method to construct host history from a
        hostname.  It looks up the host in the AFE database and
        passes it to the class constructor.

        @param afe         Autotest frontend
        @param hostname    Name of the host.
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.

        @return A new `HostJobHistory` instance.

        """
        afehost = afe.get_hosts(hostname=hostname)[0]
        return cls(afe, afehost, start_time, end_time)


    @classmethod
    def get_multiple_histories(cls, afe, start_time, end_time, labels=()):
        """Create `HostJobHistory` instances for a set of hosts.

        @param afe         Autotest frontend
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.
        @param labels      type: [str]. AFE labels to constrain the host query.
                           This option must be non-empty. An unconstrained
                           search of the DB is too costly.

        @return A list of new `HostJobHistory` instances.

        """
        assert labels, (
            'Must specify labels for get_multiple_histories. '
            'Unconstrained search of the database is prohibitively costly.')

        kwargs = {'multiple_labels': labels}
        hosts = afe.get_hosts(**kwargs)
        return [cls(afe, h, start_time, end_time) for h in hosts]

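    # Example (illustrative; the AFE handle and pool label are assumed):
    #     histories = HostJobHistory.get_multiple_histories(
    #             afe, parse_time('2015-06-01 00:00:00'),
    #             parse_time('2015-06-02 00:00:00'),
    #             labels=['pool:bvt'])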

    def __init__(self, afe, afehost, start_time, end_time):
        self._afe = afe
        self.hostname = afehost.hostname
        self.end_time = end_time
        self.start_time = start_time
        self._host = afehost
        # Don't spend time on queries until they're needed.
        self._history = None
        self._status_interval = None
        self._status_diagnosis = None
        self._status_task = None


    def _get_history(self, start_time, end_time):
        """Get the list of events for the given interval."""
        newtasks = _SpecialTaskEvent.get_tasks(
                self._afe, self._host.id, start_time, end_time)
        newhqes = _TestJobEvent.get_hqes(
                self._afe, self._host.id, start_time, end_time)
        newhistory = newtasks + newhqes
        newhistory.sort(reverse=True)
        return newhistory


    def __iter__(self):
        if self._history is None:
            self._history = self._get_history(self.start_time,
                                              self.end_time)
        return self._history.__iter__()


    def _extract_prefixed_label(self, prefix):
        labels = [l for l in self._host.labels
                    if l.startswith(prefix)]
        return labels[0][len(prefix) : ] if labels else None


    @property
    def host(self):
        """Return the AFE host object for this history."""
        return self._host


    @property
    def host_model(self):
        """Return the model name for this history's DUT."""
        prefix = constants.Labels.MODEL_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_board(self):
        """Return the board name for this history's DUT."""
        prefix = constants.Labels.BOARD_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_pool(self):
        """Return the pool name for this history's DUT."""
        prefix = constants.Labels.POOL_PREFIX
        return self._extract_prefixed_label(prefix)


    def _init_status_task(self):
        """Fill in `self._status_diagnosis` and `_status_task`."""
        if self._status_diagnosis is not None:
            return
        self._status_task = _SpecialTaskEvent.get_status_task(
                self._afe, self._host.id, self.end_time)
        if self._status_task is not None:
            self._status_diagnosis = self._status_task.diagnosis
        else:
            self._status_diagnosis = UNKNOWN


    def _init_status_interval(self):
        """Fill in `self._status_interval`."""
        if self._status_interval is not None:
            return
        self._init_status_task()
        self._status_interval = []
        if self._status_task is None:
            return
        query_end = time_utils.epoch_time_to_date_string(self.end_time)
        interval = self._afe.get_host_diagnosis_interval(
                self._host.id, query_end,
                self._status_diagnosis != WORKING)
        if not interval:
            return
        self._status_interval = self._get_history(
                parse_time(interval[0]),
                parse_time(interval[1]))


    def diagnosis_interval(self):
        """Find this history's most recent diagnosis interval.

        Returns a list of `_JobEvent` instances corresponding to the
        most recent diagnosis interval occurring before this
        history's end time.

        The list is returned as with `self._history`, ordered from
        most to least recent.

        @return The list of the `_JobEvent`s in the diagnosis
                interval.

        """
        self._init_status_interval()
        return self._status_interval

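    # Example (illustrative):  to find the logs explaining the most
    # recent failure or repair, print the log URL of each event in the
    # interval:
    #     for event in history.diagnosis_interval():
    #         print event.job_url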

    def last_diagnosis(self):
        """Return the diagnosis of whether the DUT is working.

        This searches the DUT's job history, looking for the most
        recent status task for the DUT.  Return a tuple of
        `(diagnosis, task)`.

        The `diagnosis` entry in the tuple is one of these values:
          * UNUSED - The host's last status task is older than
              `self.start_time`.
          * WORKING - The DUT is working.
          * BROKEN - The DUT likely requires manual intervention.
          * UNKNOWN - No task could be found indicating status for
              the DUT.

        If the DUT was working at last check, but hasn't been used
        inside this history's time interval, the status `UNUSED` is
        returned with the last status task, instead of `WORKING`.

        The `task` entry in the tuple is the status task that led to
        the diagnosis.  The task will be `None` if the diagnosis is
        `UNKNOWN`.

        @return A tuple with the DUT's diagnosis and the task that
                determined it.

        """
        self._init_status_task()
        diagnosis = self._status_diagnosis
        if (self.start_time is not None and
                self._status_task is not None and
                self._status_task.end_time < self.start_time and
                diagnosis == WORKING):
            diagnosis = UNUSED
        return diagnosis, self._status_task

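    # Example (illustrative):
    #     diagnosis, task = history.last_diagnosis()
    #     if diagnosis == BROKEN:
    #         print 'Broken; last status task logs:', task.job_url
    #     elif diagnosis == UNUSED:
    #         print 'Idle since', task.end_time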

def get_diagnosis_interval(host_id, end_time, success):
    """Return the last diagnosis interval for a given host and time.

    This routine queries the database for the special tasks on a
    given host before a given time.  From those tasks it selects the
    last status task before a change in status, and the first status
    task after the change.  When `success` is true, the change must
    be from "working" to "broken".  When false, the search is for a
    change in the opposite direction.

    A "successful status task" is any successful special task.  A
    "failed status task" is a failed Repair task.  These criteria
    are based on the definition of "status task" in the module-level
    docstring, above.

    This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`.

    @param host_id     Database host id of the desired host.
    @param end_time    Find the last eligible interval before this time.
    @param success     Whether the eligible interval should start with a
                       success or a failure.

    @return A list containing the start time of the earliest job
            selected, and the end time of the latest job.

    """
    base_query = afe_models.SpecialTask.objects.filter(
            host_id=host_id, is_complete=True)
    success_query = base_query.filter(success=True)
    failure_query = base_query.filter(success=False, task='Repair')
    if success:
        query0 = success_query
        query1 = failure_query
    else:
        query0 = failure_query
        query1 = success_query
    query0 = query0.filter(time_finished__lte=end_time)
    query0 = query0.order_by('time_started').reverse()
    if not query0:
        return []
    task0 = query0[0]
    query1 = query1.filter(time_finished__gt=task0.time_finished)
    task1 = query1.order_by('time_started')[0]
    return [task0.time_started.strftime(time_utils.TIME_FMT),
            task1.time_finished.strftime(time_utils.TIME_FMT)]

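# Example (illustrative):  with `success=True`, a return value such as
#     ['2015-06-01 10:00:00', '2015-06-01 11:30:00']
# runs from the start of the last successful ("working") task to the
# end of the first failed Repair ("broken") task; with `success=False`
# the roles are reversed.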

def get_status_task(host_id, end_time):
    """Get the last status task for a host before a given time.

    This routine returns a Django query for the AFE database to find
    the last task that finished on the given host before the given
    time that was either a successful task, or a Repair task.  The
    query criteria are based on the definition of "status task" in
    the module-level docstring, above.

    This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`.

    @param host_id     Database host id of the desired host.
    @param end_time    End time of the range of interest.

    @return A Django query-set selecting the single special task of
            interest.

    """
    # Selects status tasks:  any Repair task, or any successful task.
    status_tasks = (django_models.Q(task='Repair') |
                    django_models.Q(success=True))
    # Our caller needs a Django query set in order to serialize the
    # result, so we don't resolve the query here; we just return a
    # slice with at most one element.
    return afe_models.SpecialTask.objects.filter(
            status_tasks,
            host_id=host_id,
            time_finished__lte=end_time,
            is_complete=True).order_by('time_started').reverse()[0:1]

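# Example (illustrative):  the returned query set holds at most one
# task, so a caller might resolve it as
#     tasks = get_status_task(host_id, end_time)
#     task = tasks[0] if tasks else None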

def _get_job_logdir(job):
    """Gets the logdir for an AFE job.

    @param job Job object which has id and owner properties.

    @return Relative path of the results log directory.
    """
    return '%s-%s' % (job.id, job.owner)


def get_job_gs_url(job):
    """Gets the GS URL for an AFE job.

    @param job Job object which has id and owner properties.

    @return Absolute GS URL to the results log directory.
    """
    return _JobEvent.get_gs_url(_get_job_logdir(job))

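# Example (illustrative; the offload bucket and job values are assumed):
# for a job with id 123 and owner 'chromeos-test', the log directory is
# '123-chromeos-test', so the result might look like
#     gs://chromeos-autotest-results/123-chromeos-test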