Home | History | Annotate | Download | only in hosts
      1 # Copyright 2016 The Chromium OS Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 """
      6 Framework for host verification and repair in Autotest.
      7 
      8 The framework provides implementation code in support of `Host.verify()`
      9 and `Host.repair()` used in Verify and Repair special tasks.
     10 
     11 The framework consists of these classes:
     12   * `Verifier`: A class representing a single verification check.
     13   * `RepairAction`: A class representing a repair operation that can fix
     14     a failed verification check.
     15   * `RepairStrategy`:  A class for organizing a collection of `Verifier`
     16     and `RepairAction` instances, and invoking them in order.
     17 
     18 Individual operations during verification and repair are handled by
     19 instances of `Verifier` and `RepairAction`.  `Verifier` objects are
     20 meant to test for specific conditions that may cause tests to fail.
     21 `RepairAction` objects provide operations designed to fix one or
     22 more failures identified by a `Verifier` object.
     23 """
     24 
     25 import collections
     26 import logging
     27 
     28 import common
     29 from autotest_lib.client.common_lib import error
     30 
     31 
     32 class AutoservVerifyError(error.AutoservError):
     33     """
     34     Generic Exception for failures from `Verifier` objects.
     35 
     36     Instances of this exception can be raised when a `verify()`
     37     method fails, if no more specific exception is available.
     38     """
     39     pass
     40 
     41 
     42 _DependencyFailure = collections.namedtuple(
     43         '_DependencyFailure', ('dependency', 'error'))
     44 
     45 
     46 class AutoservVerifyDependencyError(error.AutoservError):
     47     """
     48     Exception raised for failures in dependencies.
     49 
     50     This exception is used to distinguish an original failure from a
     51     failure being passed back from a verification dependency.  That is,
     52     if 'B' depends on 'A', and 'A' fails, 'B' will raise this exception
     53     to signal that the original failure is further down the dependency
     54     chain.
     55 
     56     The `failures` argument to the constructor for this class is a set
     57     of instances of `_DependencyFailure`, each corresponding to one
     58     failed dependency:
     59       * The `dependency` attribute of each failure is the description
     60         of the failed dependency.
     61       * The `error` attribute of each failure is the string value of
     62         the exception from the failed dependency.
     63 
     64     Multiple methods in this module recognize and handle this exception
     65     specially.
     66 
     67     @property failures  Set of failures passed to the constructor.
     68     @property _node     Instance of `_DependencyNode` reporting the
     69                         failed dependencies.
     70     """
     71     def __init__(self, node, failures):
     72         """
     73         Constructor for `AutoservVerifyDependencyError`.
     74 
     75         @param node       Instance of _DependencyNode reporting the
     76                           failed dependencies.
     77         @param failures   List of failure tuples as described above.
     78         """
     79         super(AutoservVerifyDependencyError, self).__init__(
     80                 '\n'.join([f.error for f in failures]))
     81         self.failures = failures
     82         self._node = node
     83 
     84 
     85     def log_dependencies(self, action, deps):
     86         """
     87         Log an `AutoservVerifyDependencyError`.
     88 
     89         This writes a short summary of the dependency failures captured
     90         in this exception, using standard Python logging.
     91 
     92         The passed in `action` string plus `self._node.description`
     93         are logged at INFO level.  The `action` argument should
     94         introduce or describe an action relative to `self._node`.
     95 
     96         The passed in `deps` string and the description of each failed
     97         dependency in `self` are be logged at DEBUG level.  The `deps`
     98         argument is used to introduce the various failed dependencies.
     99 
    100         @param action   A string mentioning the action being logged
    101                         relative to `self._node`.
    102         @param deps     A string introducing the dependencies that
    103                         failed.
    104         """
    105         logging.info('%s: %s', action, self._node.description)
    106         logging.debug('%s:', deps)
    107         for failure in self.failures:
    108             logging.debug('    %s', failure.dependency)
    109 
    110 
    111 class AutoservRepairError(error.AutoservError):
    112     """
    113     Generic Exception for failures from `RepairAction` objects.
    114 
    115     Instances of this exception can be raised when a `repair()`
    116     method fails, if no more specific exception is available.
    117     """
    118     pass
    119 
    120 
    121 class _DependencyNode(object):
    122     """
    123     An object that can depend on verifiers.
    124 
    125     Both repair and verify operations have the notion of dependencies
    126     that must pass before the operation proceeds.  This class captures
    127     the shared behaviors required by both classes.
    128 
    129     @property tag               Short identifier to be used in logging.
    130     @property description       Text summary of this node's action, to be
    131                                 used in debug logs.
    132     @property _dependency_list  Dependency pre-requisites.
    133     """
    134 
    135     def __init__(self, tag, dependencies):
    136         self._dependency_list = dependencies
    137         self._tag = tag
    138 
    139 
    140     def _record(self, host, silent, *record_args):
    141         """
    142         Log a status record for `host`.
    143 
    144         Call `host.record()` with the given `record_args`, unless
    145         requested to skip by `silent`.
    146 
    147         @param host         Host which will record the status record.
    148         @param silent       Don't record the event if this is a true
    149                             value.
    150         @param record_args  Arguments to pass to `host.record()`.
    151         """
    152         if not silent:
    153             host.record(*record_args)
    154 
    155 
    156     def _verify_list(self, host, verifiers, silent):
    157         """
    158         Test a list of verifiers against a given host.
    159 
    160         This invokes `_verify_host()` on every verifier in the given
    161         list.  If any verifier in the transitive closure of dependencies
    162         in the list fails, an `AutoservVerifyDependencyError` is raised
    163         containing the description of each failed verifier.  Only
    164         original failures are reported; verifiers that don't run due
    165         to a failed dependency are omitted.
    166 
    167         By design, original failures are logged once in `_verify_host()`
    168         when `verify()` originally fails.  The additional data gathered
    169         here is for the debug logs to indicate why a subsequent
    170         operation never ran.
    171 
    172         @param host       The host to be tested against the verifiers.
    173         @param verifiers  List of verifiers to be checked.
    174         @param silent     If true, don't log host status records.
    175 
    176         @raises AutoservVerifyDependencyError   Raised when at least
    177                         one verifier in the list has failed.
    178         """
    179         failures = set()
    180         for v in verifiers:
    181             try:
    182                 v._verify_host(host, silent)
    183             except AutoservVerifyDependencyError as e:
    184                 failures.update(e.failures)
    185             except Exception as e:
    186                 failures.add(_DependencyFailure(v.description, str(e)))
    187         if failures:
    188             raise AutoservVerifyDependencyError(self, failures)
    189 
    190 
    191     def _verify_dependencies(self, host, silent):
    192         """
    193         Verify that all of this node's dependencies pass for a host.
    194 
    195         @param host     The host to be verified.
    196         @param silent   If true, don't log host status records.
    197         """
    198         try:
    199             self._verify_list(host, self._dependency_list, silent)
    200         except AutoservVerifyDependencyError as e:
    201             e.log_dependencies(
    202                     'Skipping this operation',
    203                     'The following dependencies failed')
    204             raise
    205 
    206 
    207     @property
    208     def tag(self):
    209         """
    210         Tag for use in logging status records.
    211 
    212         This is a property with a short string used to identify the node
    213         in the 'status.log' file and during node construction.  The tag
    214         should contain only letters, digits, and '_' characters.  This
    215         tag is not used alone, but is combined with other identifiers,
    216         based on the operation being logged.
    217 
    218         @return A short identifier-like string.
    219         """
    220         return self._tag
    221 
    222 
    223     @property
    224     def description(self):
    225         """
    226         Text description of this node for log messages.
    227 
    228         This string will be logged with failures, and should describe
    229         the condition required for success.
    230 
    231         N.B. Subclasses are required to override this method, but we
    232         _don't_ raise NotImplementedError here.  Various methods fail in
    233         inscrutable ways if this method raises any exception, so for
    234         debugging purposes, it's better to return a default value.
    235 
    236         @return A descriptive string.
    237         """
    238         return ('Class %s fails to implement description().' %
    239                 type(self).__name__)
    240 
    241 
    242 class Verifier(_DependencyNode):
    243     """
    244     Abstract class embodying one verification check.
    245 
    246     A concrete subclass of `Verifier` provides a simple check that can
    247     determine a host's fitness for testing.  Failure indicates that the
    248     check found a problem that can cause at least one test to fail.
    249 
    250     `Verifier` objects are organized in a DAG identifying dependencies
    251     among operations.  The DAG controls ordering and prevents wasted
    252     effort:  If verification operation V2 requires that verification
    253     operation V1 pass, then a) V1 will run before V2, and b) if V1
    254     fails, V2 won't run at all.  The `_verify_host()` method ensures
    255     that all dependencies run and pass before invoking the `verify()`
    256     method.
    257 
    258     A `Verifier` object caches its result the first time it calls
    259     `verify()`.  Subsequent calls return the cached result, without
    260     re-running the check code.  The `_reverify()` method clears the
    261     cached result in the current node, and in all dependencies.
    262 
    263     Subclasses must supply these properties and methods:
    264       * `verify()`: This is the method to perform the actual
    265         verification check.
    266       * `description`:  A one-line summary of the verification check for
    267         debug log messages.
    268 
    269     Subclasses must override all of the above attributes; subclasses
    270     should not override or extend any other attributes of this class.
    271 
    272     The description string should be a simple sentence explaining what
    273     must be true for the verifier to pass.  Do not include a terminating
    274     period.  For example:
    275 
    276         Host is available via ssh
    277 
    278     The base class manages the following private data:
    279       * `_result`:  The cached result of verification.
    280       * `_dependency_list`:  The list of dependencies.
    281     Subclasses should not use these attributes.
    282 
    283     @property _result           Cached result of verification.
    284     """
    285 
    286     def __init__(self, tag, dependencies):
    287         super(Verifier, self).__init__(tag, dependencies)
    288         self._result = None
    289         self._verify_tag = 'verify.' + self.tag
    290 
    291 
    292     def _reverify(self):
    293         """
    294         Discard cached verification results.
    295 
    296         Reset the cached verification result for this node, and for the
    297         transitive closure of all dependencies.
    298         """
    299         if self._result is not None:
    300             self._result = None
    301             for v in self._dependency_list:
    302                 v._reverify()
    303 
    304 
    305     def _verify_host(self, host, silent):
    306         """
    307         Determine the result of verification, and log results.
    308 
    309         If this verifier does not have a cached verification result,
    310         check dependencies, and if they pass, run `verify()`.  Log
    311         informational messages regarding failed dependencies.  If we
    312         call `verify()`, log the result in `status.log`.
    313 
    314         If we already have a cached result, return that result without
    315         logging any message.
    316 
    317         @param host     The host to be tested for a problem.
    318         @param silent   If true, don't log host status records.
    319         """
    320         if self._result is not None:
    321             if isinstance(self._result, Exception):
    322                 raise self._result  # cached failure
    323             elif self._result:
    324                 return              # cached success
    325         self._result = False
    326         self._verify_dependencies(host, silent)
    327         logging.info('Verifying this condition: %s', self.description)
    328         try:
    329             self.verify(host)
    330             self._record(host, silent, 'GOOD', None, self._verify_tag)
    331         except Exception as e:
    332             logging.exception('Failed: %s', self.description)
    333             self._result = e
    334             self._record(host, silent,
    335                          'FAIL', None, self._verify_tag, str(e))
    336             raise
    337         self._result = True
    338 
    339 
    340     def verify(self, host):
    341         """
    342         Unconditionally perform a verification check.
    343 
    344         This method is responsible for testing for a single problem on a
    345         host.  Implementations should follow these guidelines:
    346           * The check should find a problem that will cause testing to
    347             fail.
    348           * Verification checks on a working system should run quickly
    349             and should be optimized for success; a check that passes
    350             should finish within seconds.
    351           * Verification checks are not expected have side effects, but
    352             may apply trivial fixes if they will finish within the time
    353             constraints above.
    354 
    355         A verification check should normally trigger a single set of
    356         repair actions.  If two different failures can require two
    357         different repairs, ideally they should use two different
    358         subclasses of `Verifier`.
    359 
    360         Implementations indicate failure by raising an exception.  The
    361         exception text should be a short, 1-line summary of the error.
    362         The text should be concise and diagnostic, as it will appear in
    363         `status.log` files.
    364 
    365         If this method finds no problems, it returns without raising any
    366         exception.
    367 
    368         Implementations should avoid most logging actions, but can log
    369         DEBUG level messages if they provide significant information for
    370         diagnosing failures.
    371 
    372         @param host   The host to be tested for a problem.
    373         """
    374         raise NotImplementedError('Class %s does not implement '
    375                                   'verify()' % type(self).__name__)
    376 
    377 
    378 class RepairAction(_DependencyNode):
    379     """
    380     Abstract class embodying one repair procedure.
    381 
    382     A `RepairAction` is responsible for fixing one or more failed
    383     `Verifier` checks, in order to make those checks pass.
    384 
    385     Each repair action includes one or more verifier triggers that
    386     determine when the repair action should run.  A repair action
    387     will call its `repair()` method if one or more of its triggers
    388     fails.  A repair action is successful if all of its triggers pass
    389     after calling `repair()`.
    390 
    391     A `RepairAction` is a subclass of `_DependencyNode`; if any of a
    392     repair action's dependencies fail, the action does not check its
    393     triggers, and doesn't call `repair()`.
    394 
    395     Subclasses must supply these attributes:
    396       * `repair()`: This is the method to perform the necessary
    397         repair.  The method should avoid most logging actions, but
    398         can log DEBUG level messages if they provide significant
    399         information for diagnosing failures.
    400       * `description`:  A one-line summary of the repair action for
    401         debug log messages.
    402 
    403     Subclasses must override both of the above attributes and should
    404     not override any other attributes of this class.
    405 
    406     The description string should be a simple sentence explaining the
    407     operation that will be performed.  Do not include a terminating
    408     period.  For example:
    409 
    410         Re-install the stable build via AU
    411 
    412     @property _trigger_list   List of verification checks that will
    413                               trigger this repair when they fail.
    414     """
    415 
    416     def __init__(self, tag, dependencies, triggers):
    417         super(RepairAction, self).__init__(tag, dependencies)
    418         self._trigger_list = triggers
    419         self._repair_tag = 'repair.' + self.tag
    420 
    421 
    422     def _repair_host(self, host, silent):
    423         """
    424         Apply this repair action if any triggers fail.
    425 
    426         Repair is triggered when all dependencies are successful, and at
    427         least one trigger fails.
    428 
    429         If the `repair()` method triggers, the success or failure of
    430         this operation is logged in `status.log` bracketed by 'START'
    431         and 'END' records.  Details of whether or why `repair()`
    432         triggered are written to the debug logs.   If repair doesn't
    433         trigger, nothing is logged to `status.log`.
    434 
    435         @param host     The host to be repaired.
    436         @param silent   If true, don't log host status records.
    437         """
    438         self._verify_dependencies(host, silent)
    439         try:
    440             self._verify_list(host, self._trigger_list, silent)
    441         except AutoservVerifyDependencyError as e:
    442             e.log_dependencies(
    443                     'Attempting this repair action',
    444                     'Repairing because these triggers failed')
    445             self._record(host, silent, 'START', None, self._repair_tag)
    446             try:
    447                 self.repair(host)
    448             except Exception as e:
    449                 logging.exception('Repair failed: %s', self.description)
    450                 self._record(host, silent,
    451                              'FAIL', None, self._repair_tag, str(e))
    452                 self._record(host, silent,
    453                              'END FAIL', None, self._repair_tag)
    454                 raise
    455             try:
    456                 for v in self._trigger_list:
    457                     v._reverify()
    458                 self._verify_list(host, self._trigger_list, silent)
    459                 self._record(host, silent,
    460                              'END GOOD', None, self._repair_tag)
    461             except AutoservVerifyDependencyError as e:
    462                 e.log_dependencies(
    463                         'This repair action reported success',
    464                         'However, these triggers still fail')
    465                 self._record(host, silent,
    466                              'END FAIL', None, self._repair_tag)
    467                 raise AutoservRepairError(
    468                         'Some verification checks still fail')
    469             except Exception:
    470                 # The specification for `self._verify_list()` says
    471                 # that this can't happen; this is a defensive
    472                 # precaution.
    473                 self._record(host, silent,
    474                              'END FAIL', None, self._repair_tag,
    475                             'Internal error in repair')
    476                 raise
    477         else:
    478             logging.info('No failed triggers, skipping repair:  %s',
    479                          self.description)
    480 
    481 
    482     def repair(self, host):
    483         """
    484         Apply this repair action to the given host.
    485 
    486         This method is responsible for applying changes to fix failures
    487         in one or more verification checks.  The repair is considered
    488         successful if the DUT passes the specific checks after this
    489         method completes.
    490 
    491         Implementations indicate failure by raising an exception.  The
    492         exception text should be a short, 1-line summary of the error.
    493         The text should be concise and diagnostic, as it will appear in
    494         `status.log` files.
    495 
    496         If this method completes successfully, it returns without
    497         raising any exception.
    498 
    499         Implementations should avoid most logging actions, but can log
    500         DEBUG level messages if they provide significant information for
    501         diagnosing failures.
    502 
    503         @param host   The host to be repaired.
    504         """
    505         raise NotImplementedError('Class %s does not implement '
    506                                   'repair()' % type(self).__name__)
    507 
    508 
    509 class _RootVerifier(Verifier):
    510     """
    511     Utility class used by `RepairStrategy`.
    512 
    513     A node of this class by itself does nothing; it always passes (if it
    514     can run).  This class exists merely to be the root of a DAG of
    515     dependencies in an instance of `RepairStrategy`.
    516     """
    517 
    518     def verify(self, host):
    519         pass
    520 
    521 
    522     @property
    523     def description(self):
    524         return 'All host verification checks pass'
    525 
    526 
    527 
    528 class RepairStrategy(object):
    529     """
    530     A class for organizing `Verifier` and `RepairAction` objects.
    531 
    532     An instance of `RepairStrategy` is organized as a DAG of `Verifier`
    533     objects, plus a list of `RepairAction` objects.  The class provides
    534     methods for invoking those objects in the required order, when
    535     needed:
    536       * The `verify()` method walks the verifier DAG in dependency
    537         order.
    538       * The `repair()` method invokes the repair actions in list order.
    539         Each repair action will invoke its dependencies and triggers as
    540         needed.
    541 
    542     # The Verifier DAG
    543     The verifier DAG is constructed from the first argument passed to
    544     the passed to the `RepairStrategy` constructor.  That argument is an
    545     iterable consisting of three-element tuples in the form
    546     `(constructor, tag, deps)`:
    547       * The `constructor` value is a callable that creates a `Verifier`
    548         as for the interface of the class constructor.  For classes
    549         that inherit the default constructor from `Verifier`, this can
    550         be the class itself.
    551       * The `tag` value is the tag to be associated with the constructed
    552         verifier.
    553       * The `deps` value is an iterable (e.g. list or tuple) of strings.
    554         Each string corresponds to the `tag` member of a `Verifier`
    555         dependency.
    556 
    557     The tag names of verifiers in the constructed DAG must all be
    558     unique.  The tag name defined by `RepairStrategy.ROOT_TAG` is
    559     reserved and may not be used by any verifier.
    560 
    561     In the input data for the constructor, dependencies must appear
    562     before the nodes that depend on them.  Thus:
    563 
    564         ((A, 'a', ()), (B, 'b', ('a',)))     # This is valid
    565         ((B, 'b', ('a',)), (A, 'a', ()))     # This will fail!
    566 
    567     Internally, the DAG of verifiers is given unique root node.  So,
    568     given this input:
    569 
    570         ((C, 'c', ()),
    571          (A, 'a', ('c',)),
    572          (B, 'b', ('c',)))
    573 
    574     The following DAG is constructed:
    575 
    576           Root
    577           /  \
    578          A    B
    579           \  /
    580            C
    581 
    582     Since nothing depends on `A` or `B`, the root node guarantees that
    583     these two verifiers will both be called and properly logged.
    584 
    585     The root node is not directly accessible; however repair actions can
    586     trigger on it by using `RepairStrategy.ROOT_TAG`.  Additionally, the
    587     node will be logged in `status.log` whenever `verify()` succeeds.
    588 
    589     # The Repair Actions List
    590     The list of repair actions is constructed from the second argument
    591     passed to the passed to the `RepairStrategy` constructor.  That
    592     argument is an iterable consisting of four-element tuples in the
    593     form `(constructor, tag, deps, triggers)`:
    594       * The `constructor` value is a callable that creates a
    595         `RepairAction` as for the interface of the class constructor.
    596         For classes that inherit the default constructor from
    597         `RepairAction`, this can be the class itself.
    598       * The `tag` value is the tag to be associated with the constructed
    599         repair action.
    600       * The `deps` value is an iterable (e.g. list or tuple) of strings.
    601         Each string corresponds to the `tag` member of a `Verifier` that
    602         the repair action depends on.
    603       * The `triggers` value is an iterable (e.g. list or tuple) of
    604         strings.  Each string corresponds to the `tag` member of a
    605         `Verifier` that can trigger the repair action.
    606 
    607     `RepairStrategy` deps and triggers can only refer to verifiers,
    608     not to other repair actions.
    609     """
    610 
    611     # This name is reserved; clients may not use it.
    612     ROOT_TAG = 'PASS'
    613 
    614     @staticmethod
    615     def _add_verifier(verifiers, constructor, tag, dep_tags):
    616         """
    617         Construct and remember a verifier.
    618 
    619         Create a `Verifier` using `constructor` and `tag`.  Dependencies
    620         for construction are found by looking up `dep_tags` in the
    621         `verifiers` dictionary.
    622 
    623         After construction, the new verifier is added to `verifiers`.
    624 
    625         @param verifiers    Dictionary of verifiers, indexed by tag.
    626         @param constructor  Verifier construction function.
    627         @param tag          Tag parameter for the construction function.
    628         @param dep_tags     Tags of dependencies for the constructor, to
    629                             be found in `verifiers`.
    630         """
    631         assert tag not in verifiers
    632         deps = [verifiers[d] for d in dep_tags]
    633         verifiers[tag] = constructor(tag, deps)
    634 
    635 
    636     def __init__(self, verifier_data, repair_data):
    637         """
    638         Construct a `RepairStrategy` from simplified DAG data.
    639 
    640         The input `verifier_data` object describes how to construct
    641         verify nodes and the dependencies that relate them, as detailed
    642         above.
    643 
    644         The input `repair_data` object describes how to construct repair
    645         actions and their dependencies and triggers, as detailed above.
    646 
    647         @param verifier_data  Iterable value with constructors for the
    648                               elements of the verification DAG and their
    649                               dependencies.
    650         @param repair_data    Iterable value with constructors for the
    651                               elements of the repair action list, and
    652                               their dependencies and triggers.
    653         """
    654         # We use the `all_verifiers` list to guarantee that our root
    655         # verifier will execute its dependencies in the order provided
    656         # to us by our caller.
    657         verifier_map = {}
    658         all_tags = []
    659         dependencies = set()
    660         for constructor, tag, deps in verifier_data:
    661             self._add_verifier(verifier_map, constructor, tag, deps)
    662             dependencies.update(deps)
    663             all_tags.append(tag)
    664         # Capture all the verifiers that have nothing depending on them.
    665         root_tags = [t for t in all_tags if t not in dependencies]
    666         self._add_verifier(verifier_map, _RootVerifier,
    667                            self.ROOT_TAG, root_tags)
    668         self._verify_root = verifier_map[self.ROOT_TAG]
    669         self._repair_actions = []
    670         for constructor, tag, deps, triggers in repair_data:
    671             r = constructor(tag,
    672                             [verifier_map[d] for d in deps],
    673                             [verifier_map[t] for t in triggers])
    674             self._repair_actions.append(r)
    675 
    676 
    677     def verify(self, host, silent=False):
    678         """
    679         Run the verifier DAG on the given host.
    680 
    681         @param host     The target to be verified.
    682         @param silent   If true, don't log host status records.
    683         """
    684         self._verify_root._reverify()
    685         self._verify_root._verify_host(host, silent)
    686 
    687 
    688     def repair(self, host, silent=False):
    689         """
    690         Run the repair list on the given host.
    691 
    692         @param host     The target to be repaired.
    693         @param silent   If true, don't log host status records.
    694         """
    695         self._verify_root._reverify()
    696         for ra in self._repair_actions:
    697             try:
    698                 ra._repair_host(host, silent)
    699             except Exception as e:
    700                 # all logging and exception handling was done at
    701                 # lower levels
    702                 pass
    703         self._verify_root._verify_host(host, silent)
    704