Home | History | Annotate | Download | only in hosts
      1 # Copyright 2016 The Chromium OS Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 """
      6 Framework for host verification and repair in Autotest.
      7 
      8 The framework provides implementation code in support of `Host.verify()`
      9 and `Host.repair()` used in Verify and Repair special tasks.
     10 
     11 The framework consists of these classes:
     12   * `Verifier`: A class representing a single verification check.
     13   * `RepairAction`: A class representing a repair operation that can fix
     14     a failed verification check.
     15   * `RepairStrategy`:  A class for organizing a collection of `Verifier`
     16     and `RepairAction` instances, and invoking them in order.
     17 
     18 Individual operations during verification and repair are handled by
     19 instances of `Verifier` and `RepairAction`.  `Verifier` objects are
     20 meant to test for specific conditions that may cause tests to fail.
     21 `RepairAction` objects provide operations designed to fix one or
     22 more failures identified by a `Verifier` object.
     23 """
     24 
     25 import collections
     26 import logging
     27 import re
     28 
     29 import common
     30 from autotest_lib.client.common_lib import error
     31 
     32 try:
     33     from chromite.lib import metrics
     34 except ImportError:
     35     from autotest_lib.client.bin.utils import metrics_mock as metrics
     36 
# Regular expression pattern used when filtering hostnames.  It matches
# names of the form
#     chromeos<digit>-row<digits>[letter]-rack<digits>[letter]-host<digits>
# NOTE(review): the code that consumes these two constants is outside
# this part of the file; presumably `_DISALLOWED_HOSTNAME` is the
# placeholder used for hostnames that don't match the pattern - confirm.
_HOSTNAME_PATTERN = 'chromeos[0-9]-row[0-9]+[a-z]?-rack[0-9]+[a-z]?-host[0-9]+'
_DISALLOWED_HOSTNAME = 'disallowed_hostname'
     40 
     41 
     42 class AutoservVerifyError(error.AutoservError):
     43     """
     44     Generic Exception for failures from `Verifier` objects.
     45 
     46     Instances of this exception can be raised when a `verify()`
     47     method fails, if no more specific exception is available.
     48     """
     49     pass
     50 
     51 
     52 _DependencyFailure = collections.namedtuple(
     53         '_DependencyFailure', ('dependency', 'error', 'tag'))
     54 
     55 
     56 class AutoservVerifyDependencyError(error.AutoservError):
     57     """
     58     Exception raised for failures in dependencies.
     59 
     60     This exception is used to distinguish an original failure from a
     61     failure being passed back from a verification dependency.  That is,
     62     if 'B' depends on 'A', and 'A' fails, 'B' will raise this exception
     63     to signal that the original failure is further down the dependency
     64     chain.
     65 
     66     The `failures` argument to the constructor for this class is a set
     67     of instances of `_DependencyFailure`, each corresponding to one
     68     failed dependency:
     69       * The `dependency` attribute of each failure is the description
     70         of the failed dependency.
     71       * The `error` attribute of each failure is the string value of
     72         the exception from the failed dependency.
     73 
     74     Multiple methods in this module recognize and handle this exception
     75     specially.
     76 
     77     @property failures  Set of failures passed to the constructor.
     78     @property _node     Instance of `_DependencyNode` reporting the
     79                         failed dependencies.
     80     """
     81 
     82     def __init__(self, node, failures):
     83         """
     84         Constructor for `AutoservVerifyDependencyError`.
     85 
     86         @param node       Instance of _DependencyNode reporting the
     87                           failed dependencies.
     88         @param failures   List of failure tuples as described above.
     89         """
     90         super(AutoservVerifyDependencyError, self).__init__(
     91                 '\n'.join([f.error for f in failures]))
     92         self.failures = failures
     93         self._node = node
     94 
     95     def log_dependencies(self, action, deps):
     96         """
     97         Log an `AutoservVerifyDependencyError`.
     98 
     99         This writes a short summary of the dependency failures captured
    100         in this exception, using standard Python logging.
    101 
    102         The passed in `action` string plus `self._node.description`
    103         are logged at INFO level.  The `action` argument should
    104         introduce or describe an action relative to `self._node`.
    105 
    106         The passed in `deps` string and the description of each failed
    107         dependency in `self` are be logged at DEBUG level.  The `deps`
    108         argument is used to introduce the various failed dependencies.
    109 
    110         @param action   A string mentioning the action being logged
    111                         relative to `self._node`.
    112         @param deps     A string introducing the dependencies that
    113                         failed.
    114         """
    115         logging.info('%s: %s', action, self._node.description)
    116         logging.debug('%s:', deps)
    117         for failure in self.failures:
    118             logging.debug('    %s', failure.dependency)
    119 
    120 
class AutoservRepairError(error.AutoservError):
    """
    Generic Exception for failures from `RepairAction` objects.

    Instances of this exception can be raised when a `repair()`
    method fails, if no more specific exception is available.

    @property tag   Short identifier for the failure, reported in
                    repair metrics.
    """
    def __init__(self, description, tag):
        """
        Constructor for `AutoservRepairError`.

        @param description  Message describing the exception.
        @param tag          A short identifier used for metric purposes.
        """
        super(AutoservRepairError, self).__init__(description)
        self.tag = tag
    135 
    136 
    137 class _DependencyNode(object):
    138     """
    139     An object that can depend on verifiers.
    140 
    141     Both repair and verify operations have the notion of dependencies
    142     that must pass before the operation proceeds.  This class captures
    143     the shared behaviors required by both classes.
    144 
    145     @property tag               Short identifier to be used in logging.
    146     @property description       Text summary of this node's action, to be
    147                                 used in debug logs.
    148     @property _dependency_list  Dependency pre-requisites.
    149     """
    150 
    151     def __init__(self, tag, record_type, dependencies):
    152         self._dependency_list = dependencies
    153         self._tag = tag
    154         self._record_tag = record_type + '.' + tag
    155 
    156     def _record(self, host, silent, status_code, *record_args):
    157         """
    158         Log a status record for `host`.
    159 
    160         Call `host.record()` using the given status_code, and
    161         operation tag `self._record_tag`, plus any extra arguments in
    162         `record_args`.  Do nothing if `silent` is a true value.
    163 
    164         @param host         Host which will record the status record.
    165         @param silent       Don't record the event if this is a true
    166                             value.
    167         @param status_code  Value for the `status_code` parameter to
    168                             `host.record()`.
    169         @param record_args  Additional arguments to pass to
    170                             `host.record()`.
    171         """
    172         if not silent:
    173             host.record(status_code, None, self._record_tag,
    174                         *record_args)
    175 
    176     def _record_good(self, host, silent):
    177         """Log a 'GOOD' status line.
    178 
    179         @param host         Host which will record the status record.
    180         @param silent       Don't record the event if this is a true
    181                             value.
    182         """
    183         self._record(host, silent, 'GOOD')
    184 
    185     def _record_fail(self, host, silent, exc):
    186         """Log a 'FAIL' status line.
    187 
    188         @param host         Host which will record the status record.
    189         @param silent       Don't record the event if this is a true
    190                             value.
    191         @param exc          Exception describing the cause of failure.
    192         """
    193         self._record(host, silent, 'FAIL', str(exc))
    194 
    195     def _verify_list(self, host, verifiers, silent):
    196         """
    197         Test a list of verifiers against a given host.
    198 
    199         This invokes `_verify_host()` on every verifier in the given
    200         list.  If any verifier in the transitive closure of dependencies
    201         in the list fails, an `AutoservVerifyDependencyError` is raised
    202         containing the description of each failed verifier.  Only
    203         original failures are reported; verifiers that don't run due
    204         to a failed dependency are omitted.
    205 
    206         By design, original failures are logged once in `_verify_host()`
    207         when `verify()` originally fails.  The additional data gathered
    208         here is for the debug logs to indicate why a subsequent
    209         operation never ran.
    210 
    211         @param host       The host to be tested against the verifiers.
    212         @param verifiers  List of verifiers to be checked.
    213         @param silent     If true, don't log host status records.
    214 
    215         @raises AutoservVerifyDependencyError   Raised when at least
    216                         one verifier in the list has failed.
    217         """
    218         failures = set()
    219         for v in verifiers:
    220             try:
    221                 v._verify_host(host, silent)
    222             except AutoservVerifyDependencyError as e:
    223                 failures.update(e.failures)
    224             except Exception as e:
    225                 failures.add(_DependencyFailure(v.description, str(e), v.tag))
    226         if failures:
    227             raise AutoservVerifyDependencyError(self, failures)
    228 
    229     def _verify_dependencies(self, host, silent):
    230         """
    231         Verify that all of this node's dependencies pass for a host.
    232 
    233         @param host     The host to be verified.
    234         @param silent   If true, don't log host status records.
    235         """
    236         try:
    237             self._verify_list(host, self._dependency_list, silent)
    238         except AutoservVerifyDependencyError as e:
    239             e.log_dependencies(
    240                     'Skipping this operation',
    241                     'The following dependencies failed')
    242             raise
    243 
    244     @property
    245     def tag(self):
    246         """
    247         Tag for use in logging status records.
    248 
    249         This is a property with a short string used to identify the node
    250         in the 'status.log' file and during node construction.  The tag
    251         should contain only letters, digits, and '_' characters.  This
    252         tag is not used alone, but is combined with other identifiers,
    253         based on the operation being logged.
    254 
    255         @return A short identifier-like string.
    256         """
    257         return self._tag
    258 
    259     @property
    260     def description(self):
    261         """
    262         Text description of this node for log messages.
    263 
    264         This string will be logged with failures, and should describe
    265         the condition required for success.
    266 
    267         N.B. Subclasses are required to override this method, but we
    268         _don't_ raise NotImplementedError here.  Various methods fail in
    269         inscrutable ways if this method raises any exception, so for
    270         debugging purposes, it's better to return a default value.
    271 
    272         @return A descriptive string.
    273         """
    274         return ('Class %s fails to implement description().' %
    275                 type(self).__name__)
    276 
    277 
    278 class Verifier(_DependencyNode):
    279     """
    280     Abstract class embodying one verification check.
    281 
    282     A concrete subclass of `Verifier` provides a simple check that can
    283     determine a host's fitness for testing.  Failure indicates that the
    284     check found a problem that can cause at least one test to fail.
    285 
    286     `Verifier` objects are organized in a DAG identifying dependencies
    287     among operations.  The DAG controls ordering and prevents wasted
    288     effort:  If verification operation V2 requires that verification
    289     operation V1 pass, then a) V1 will run before V2, and b) if V1
    290     fails, V2 won't run at all.  The `_verify_host()` method ensures
    291     that all dependencies run and pass before invoking the `verify()`
    292     method.
    293 
    294     A `Verifier` object caches its result the first time it calls
    295     `verify()`.  Subsequent calls return the cached result, without
    296     re-running the check code.  The `_reverify()` method clears the
    297     cached result in the current node, and in all dependencies.
    298 
    299     Subclasses must supply these properties and methods:
    300       * `verify()`: This is the method to perform the actual
    301         verification check.
    302       * `description`:  A one-line summary of the verification check for
    303         debug log messages.
    304 
    305     Subclasses must override all of the above attributes; subclasses
    306     should not override or extend any other attributes of this class.
    307 
    308     The description string should be a simple sentence explaining what
    309     must be true for the verifier to pass.  Do not include a terminating
    310     period.  For example:
    311 
    312         Host is available via ssh
    313 
    314     The base class manages the following private data:
    315       * `_result`:  The cached result of verification.
    316       * `_dependency_list`:  The list of dependencies.
    317     Subclasses should not use these attributes.
    318 
    319     @property _result           Cached result of verification.
    320     """
    321 
    322     def __init__(self, tag, dependencies):
    323         super(Verifier, self).__init__(tag, 'verify', dependencies)
    324         self._result = None
    325 
    326     def _reverify(self):
    327         """
    328         Discard cached verification results.
    329 
    330         Reset the cached verification result for this node, and for the
    331         transitive closure of all dependencies.
    332         """
    333         if self._result is not None:
    334             self._result = None
    335             for v in self._dependency_list:
    336                 v._reverify()
    337 
    338     def _verify_host(self, host, silent):
    339         """
    340         Determine the result of verification, and log results.
    341 
    342         If this verifier does not have a cached verification result,
    343         check dependencies, and if they pass, run `verify()`.  Log
    344         informational messages regarding failed dependencies.  If we
    345         call `verify()`, log the result in `status.log`.
    346 
    347         If we already have a cached result, return that result without
    348         logging any message.
    349 
    350         @param host     The host to be tested for a problem.
    351         @param silent   If true, don't log host status records.
    352         """
    353         if self._result is not None:
    354             if isinstance(self._result, Exception):
    355                 raise self._result  # cached failure
    356             elif self._result:
    357                 return              # cached success
    358         self._result = False
    359         self._verify_dependencies(host, silent)
    360         logging.info('Verifying this condition: %s', self.description)
    361         try:
    362             self.verify(host)
    363             self._record_good(host, silent)
    364         except Exception as e:
    365             logging.exception('Failed: %s', self.description)
    366             self._result = e
    367             self._record_fail(host, silent, e)
    368             raise
    369         self._result = True
    370 
    371     def verify(self, host):
    372         """
    373         Unconditionally perform a verification check.
    374 
    375         This method is responsible for testing for a single problem on a
    376         host.  Implementations should follow these guidelines:
    377           * The check should find a problem that will cause testing to
    378             fail.
    379           * Verification checks on a working system should run quickly
    380             and should be optimized for success; a check that passes
    381             should finish within seconds.
    382           * Verification checks are not expected have side effects, but
    383             may apply trivial fixes if they will finish within the time
    384             constraints above.
    385 
    386         A verification check should normally trigger a single set of
    387         repair actions.  If two different failures can require two
    388         different repairs, ideally they should use two different
    389         subclasses of `Verifier`.
    390 
    391         Implementations indicate failure by raising an exception.  The
    392         exception text should be a short, 1-line summary of the error.
    393         The text should be concise and diagnostic, as it will appear in
    394         `status.log` files.
    395 
    396         If this method finds no problems, it returns without raising any
    397         exception.
    398 
    399         Implementations should avoid most logging actions, but can log
    400         DEBUG level messages if they provide significant information for
    401         diagnosing failures.
    402 
    403         @param host   The host to be tested for a problem.
    404         """
    405         raise NotImplementedError('Class %s does not implement '
    406                                   'verify()' % type(self).__name__)
    407 
    408 
    409 class RepairAction(_DependencyNode):
    410     """
    411     Abstract class embodying one repair procedure.
    412 
    413     A `RepairAction` is responsible for fixing one or more failed
    414     `Verifier` checks, in order to make those checks pass.
    415 
    416     Each repair action includes one or more verifier triggers that
    417     determine when the repair action should run.  A repair action
    418     will call its `repair()` method if one or more of its triggers
    419     fails.  A repair action is successful if all of its triggers pass
    420     after calling `repair()`.
    421 
    422     A `RepairAction` is a subclass of `_DependencyNode`; if any of a
    423     repair action's dependencies fail, the action does not check its
    424     triggers, and doesn't call `repair()`.
    425 
    426     Subclasses must supply these attributes:
    427       * `repair()`: This is the method to perform the necessary
    428         repair.  The method should avoid most logging actions, but
    429         can log DEBUG level messages if they provide significant
    430         information for diagnosing failures.
    431       * `description`:  A one-line summary of the repair action for
    432         debug log messages.
    433 
    434     Subclasses must override both of the above attributes and should
    435     not override any other attributes of this class.
    436 
    437     The description string should be a simple sentence explaining the
    438     operation that will be performed.  Do not include a terminating
    439     period.  For example:
    440 
    441         Re-install the stable build via AU
    442 
    443     @property _trigger_list   List of verification checks that will
    444                               trigger this repair when they fail.
    445     @property host_class      A string identifier that will be
    446                               used as a field to send repair metrics.
    447     """
    448 
    449     def __init__(self, tag, dependencies, triggers, host_class):
    450         super(RepairAction, self).__init__(tag, 'repair', dependencies)
    451         self._trigger_list = triggers
    452         self._failure_modes_counter = metrics.Counter(
    453             'chromeos/autotest/repair/failure_modes')
    454         self._failure_detail_counter = metrics.Counter(
    455             'chromeos/autotest/repair/failure_detail')
    456         self.host_class = host_class
    457 
    458     def _record_start(self, host, silent):
    459         """Log a 'START' status line.
    460 
    461         @param host         Host which will record the status record.
    462         @param silent       Don't record the event if this is a true
    463                             value.
    464         """
    465         self._record(host, silent, 'START')
    466 
    467     def _record_end_good(self, host, silent):
    468         """Log an 'END GOOD' status line.
    469 
    470         @param host         Host which will record the status record.
    471         @param silent       Don't record the event if this is a true
    472                             value.
    473         """
    474         self._record(host, silent, 'END GOOD')
    475         self.status = 'repaired'
    476 
    477     def _record_end_fail(self, host, silent, status, *args):
    478         """Log an 'END FAIL' status line.
    479 
    480         @param host         Host which will record the status record.
    481         @param silent       Don't record the event if this is a true
    482                             value.
    483         @param args         Extra arguments to `self._record()`
    484         """
    485         self._record(host, silent, 'END FAIL', *args)
    486         self.status = status
    487 
    488     def _send_failure_metrics(self, host, error, stage):
    489         """Send failure mode metrics to monarch
    490 
    491         @param host         Host which this RepairAction targeted to.
    492         @param error        An exception that caught in _repair_host.
    493         @param stage        In which stage we caught above exception.
    494                             Can be one of below value:
    495                                 'dep'    during verify dependencies
    496                                 'pre'    during pre-repair trigger verification
    497                                 'repair' during repair() process itself
    498                                 'post'   during post-repair trigger verification
    499         """
    500 
    501         def get_fields(vf_tag):
    502             fields = {
    503                 'ra_tag': self.tag,
    504                 'vf_tag': vf_tag,
    505                 'hostname': _filter_metrics_hostname(host),
    506                 'stage': stage,
    507                 'host_class': self.host_class
    508             }
    509             return fields
    510 
    511         if isinstance(error, AutoservVerifyDependencyError):
    512             # We'll catch all failure tags here for a dependencies error
    513             for f in error.failures:
    514                 self._failure_modes_counter.increment(fields=get_fields(f.tag))
    515         else:
    516             # When there is failure during repair or unknown failure. there
    517             # will be no Verifier, so vf_tag set to 'unknown'.
    518             self._failure_modes_counter.increment(fields=get_fields('unknown'))
    519 
    520         if stage == 'repair':
    521             self._send_failure_detail(error)
    522 
    523     def _send_failure_detail(self, error):
    524         """Send reason of failure inside repair() to monarch.
    525 
    526         @param error    The exception caught inside repair().
    527         """
    528         tag = error.tag if isinstance(error, AutoservRepairError) else 'unknown'
    529         fields = {'repair_action_tag': self.tag, 'repair_failure_tag': tag}
    530         self._failure_detail_counter.increment(fields=fields)
    531 
    532     def _repair_host(self, host, silent):
    533         """
    534         Apply this repair action if any triggers fail.
    535 
    536         Repair is triggered when all dependencies are successful, and at
    537         least one trigger fails.
    538 
    539         If the `repair()` method triggers, the success or failure of
    540         this operation is logged in `status.log` bracketed by 'START'
    541         and 'END' records.  Details of whether or why `repair()`
    542         triggered are written to the debug logs.   If repair doesn't
    543         trigger, nothing is logged to `status.log`.
    544 
    545         @param host     The host to be repaired.
    546         @param silent   If true, don't log host status records.
    547         """
    548         # Note:  Every exit path from the method must set `self.status`.
    549         # There's a lot of exit paths, so be careful.
    550         #
    551         # If we're blocked by a failed dependency, we exit with an
    552         # exception.  So set status to 'blocked' first.
    553         self.status = 'blocked'
    554         try:
    555             self._verify_dependencies(host, silent)
    556         except Exception as e:
    557             self._send_failure_metrics(host, e, 'dep')
    558             raise
    559         # This is a defensive action.  Every path below should overwrite
    560         # this setting, but if it doesn't, we want our status to reflect
    561         # a coding error.
    562         self.status = 'unknown'
    563         try:
    564             self._verify_list(host, self._trigger_list, silent)
    565         except AutoservVerifyDependencyError as e:
    566             e.log_dependencies(
    567                     'Attempting this repair action',
    568                     'Repairing because these triggers failed')
    569             self._send_failure_metrics(host, e, 'pre')
    570             self._record_start(host, silent)
    571             try:
    572                 self.repair(host)
    573             except Exception as e:
    574                 logging.exception('Repair failed: %s', self.description)
    575                 self._record_fail(host, silent, e)
    576                 self._record_end_fail(host, silent, 'repair_failure')
    577                 self._send_failure_metrics(host, e, 'repair')
    578                 raise
    579             try:
    580                 for v in self._trigger_list:
    581                     v._reverify()
    582                 self._verify_list(host, self._trigger_list, silent)
    583                 self._record_end_good(host, silent)
    584             except AutoservVerifyDependencyError as e:
    585                 e.log_dependencies(
    586                         'This repair action reported success',
    587                         'However, these triggers still fail')
    588                 self._record_end_fail(host, silent, 'verify_failure')
    589                 self._send_failure_metrics(host, e, 'post')
    590                 raise AutoservRepairError(
    591                         'Some verification checks still fail', 'post_verify')
    592             except Exception:
    593                 # The specification for `self._verify_list()` says
    594                 # that this can't happen; this is a defensive
    595                 # precaution.
    596                 self._record_end_fail(host, silent, 'unknown',
    597                                       'Internal error in repair')
    598                 self._send_failure_metrics(host, e, 'post')
    599                 raise
    600         else:
    601             self.status = 'skipped'
    602             logging.info('No failed triggers, skipping repair:  %s',
    603                          self.description)
    604 
    605     def repair(self, host):
    606         """
    607         Apply this repair action to the given host.
    608 
    609         This method is responsible for applying changes to fix failures
    610         in one or more verification checks.  The repair is considered
    611         successful if the DUT passes the specific checks after this
    612         method completes.
    613 
    614         Implementations indicate failure by raising an exception.  The
    615         exception text should be a short, 1-line summary of the error.
    616         The text should be concise and diagnostic, as it will appear in
    617         `status.log` files.
    618 
    619         If this method completes successfully, it returns without
    620         raising any exception.
    621 
    622         Implementations should avoid most logging actions, but can log
    623         DEBUG level messages if they provide significant information for
    624         diagnosing failures.
    625 
    626         @param host   The host to be repaired.
    627         """
    628         raise NotImplementedError('Class %s does not implement '
    629                                   'repair()' % type(self).__name__)
    630 
    631 
    632 class _RootVerifier(Verifier):
    633     """
    634     Utility class used by `RepairStrategy`.
    635 
    636     A node of this class by itself does nothing; it always passes (if it
    637     can run).  This class exists merely to be the root of a DAG of
    638     dependencies in an instance of `RepairStrategy`.
    639     """
    640 
    641     def verify(self, host):
    642         pass
    643 
    644     @property
    645     def description(self):
    646         return 'All host verification checks pass'
    647 
    648 
    649 class RepairStrategy(object):
    650     """
    651     A class for organizing `Verifier` and `RepairAction` objects.
    652 
    653     An instance of `RepairStrategy` is organized as a DAG of `Verifier`
    654     objects, plus a list of `RepairAction` objects.  The class provides
    655     methods for invoking those objects in the required order, when
    656     needed:
    657       * The `verify()` method walks the verifier DAG in dependency
    658         order.
    659       * The `repair()` method invokes the repair actions in list order.
    660         Each repair action will invoke its dependencies and triggers as
    661         needed.
    662 
    663     # The Verifier DAG
    664     The verifier DAG is constructed from the first argument passed to
    665     the passed to the `RepairStrategy` constructor.  That argument is an
    666     iterable consisting of three-element tuples in the form
    667     `(constructor, tag, deps)`:
    668       * The `constructor` value is a callable that creates a `Verifier`
    669         as for the interface of the class constructor.  For classes
    670         that inherit the default constructor from `Verifier`, this can
    671         be the class itself.
    672       * The `tag` value is the tag to be associated with the constructed
    673         verifier.
    674       * The `deps` value is an iterable (e.g. list or tuple) of strings.
    675         Each string corresponds to the `tag` member of a `Verifier`
    676         dependency.
    677 
    678     The tag names of verifiers in the constructed DAG must all be
    679     unique.  The tag name defined by `RepairStrategy.ROOT_TAG` is
    680     reserved and may not be used by any verifier.
    681 
    682     In the input data for the constructor, dependencies must appear
    683     before the nodes that depend on them.  Thus:
    684 
    685         ((A, 'a', ()), (B, 'b', ('a',)))     # This is valid
    686         ((B, 'b', ('a',)), (A, 'a', ()))     # This will fail!
    687 
    688     Internally, the DAG of verifiers is given unique root node.  So,
    689     given this input:
    690 
    691         ((C, 'c', ()),
    692          (A, 'a', ('c',)),
    693          (B, 'b', ('c',)))
    694 
    695     The following DAG is constructed:
    696 
    697           Root
    698           /  \
    699          A    B
    700           \  /
    701            C
    702 
    703     Since nothing depends on `A` or `B`, the root node guarantees that
    704     these two verifiers will both be called and properly logged.
    705 
    706     The root node is not directly accessible; however repair actions can
    707     trigger on it by using `RepairStrategy.ROOT_TAG`.  Additionally, the
    708     node will be logged in `status.log` whenever `verify()` succeeds.
    709 
    710     # The Repair Actions List
    711     The list of repair actions is constructed from the second argument
    712     passed to the passed to the `RepairStrategy` constructor.  That
    713     argument is an iterable consisting of four-element tuples in the
    714     form `(constructor, tag, deps, triggers)`:
    715       * The `constructor` value is a callable that creates a
    716         `RepairAction` as for the interface of the class constructor.
    717         For classes that inherit the default constructor from
    718         `RepairAction`, this can be the class itself.
    719       * The `tag` value is the tag to be associated with the constructed
    720         repair action.
    721       * The `deps` value is an iterable (e.g. list or tuple) of strings.
    722         Each string corresponds to the `tag` member of a `Verifier` that
    723         the repair action depends on.
    724       * The `triggers` value is an iterable (e.g. list or tuple) of
    725         strings.  Each string corresponds to the `tag` member of a
    726         `Verifier` that can trigger the repair action.
    727 
    728     `RepairStrategy` deps and triggers can only refer to verifiers,
    729     not to other repair actions.
    730     """
    731 
    732     # This name is reserved; clients may not use it.
    733     ROOT_TAG = 'PASS'
    734 
    735     @staticmethod
    736     def _add_verifier(verifiers, constructor, tag, dep_tags):
    737         """
    738         Construct and remember a verifier.
    739 
    740         Create a `Verifier` using `constructor` and `tag`.  Dependencies
    741         for construction are found by looking up `dep_tags` in the
    742         `verifiers` dictionary.
    743 
    744         After construction, the new verifier is added to `verifiers`.
    745 
    746         @param verifiers    Dictionary of verifiers, indexed by tag.
    747         @param constructor  Verifier construction function.
    748         @param tag          Tag parameter for the construction function.
    749         @param dep_tags     Tags of dependencies for the constructor, to
    750                             be found in `verifiers`.
    751         """
    752         assert tag not in verifiers
    753         deps = [verifiers[d] for d in dep_tags]
    754         verifiers[tag] = constructor(tag, deps)
    755 
    756     def __init__(self, verifier_data, repair_data, host_class):
    757         """
    758         Construct a `RepairStrategy` from simplified DAG data.
    759 
    760         The input `verifier_data` object describes how to construct
    761         verify nodes and the dependencies that relate them, as detailed
    762         above.
    763 
    764         The input `repair_data` object describes how to construct repair
    765         actions and their dependencies and triggers, as detailed above.
    766 
    767         @param verifier_data  Iterable value with constructors for the
    768                               elements of the verification DAG and their
    769                               dependencies.
    770         @param repair_data    Iterable value with constructors for the
    771                               elements of the repair action list, and
    772                               their dependencies and triggers.
    773         @property host_class  A string identifier that identify what
    774                               class of host this repair strategy target
    775                               on, will be used as a field to send repair
    776                               metrics.
    777         """
    778         # Metrics - we report on 'actions' for every repair action
    779         # we execute; we report on 'strategy' for every complete
    780         # repair operation.
    781         self._strategy_counter = metrics.Counter(
    782             'chromeos/autotest/repair/repair_strategy_v2')
    783         self._actions_counter = metrics.Counter(
    784             'chromeos/autotest/repair/repair_actions')
    785         self.host_class = host_class
    786         # We use the `all_verifiers` list to guarantee that our root
    787         # verifier will execute its dependencies in the order provided
    788         # to us by our caller.
    789         verifier_map = {}
    790         all_tags = []
    791         dependencies = set()
    792         for constructor, tag, deps in verifier_data:
    793             self._add_verifier(verifier_map, constructor, tag, deps)
    794             dependencies.update(deps)
    795             all_tags.append(tag)
    796         # Capture all the verifiers that have nothing depending on them.
    797         root_tags = [t for t in all_tags if t not in dependencies]
    798         self._add_verifier(verifier_map, _RootVerifier,
    799                            self.ROOT_TAG, root_tags)
    800         self._verify_root = verifier_map[self.ROOT_TAG]
    801         self._repair_actions = []
    802         for constructor, tag, deps, triggers in repair_data:
    803             r = constructor(tag,
    804                             [verifier_map[d] for d in deps],
    805                             [verifier_map[t] for t in triggers],
    806                             self.host_class)
    807             self._repair_actions.append(r)
    808 
    809     def _send_strategy_metrics(self, host, result):
    810         """Send repair strategy metrics to monarch
    811 
    812         @param host     The target to be repaired.
    813         @param result   A String that describe a final result for the
    814                         RepairStrategy.
    815         """
    816         info = host.host_info_store.get()
    817         board = info.board if info.board else 'unknown'
    818         model = info.model if info.model else 'unknown'
    819         fields = {
    820             'board': board,
    821             'host_class': self.host_class,
    822             'hostname': _filter_metrics_hostname(host),
    823             'model': model,
    824             'result': result,
    825         }
    826         self._strategy_counter.increment(fields=fields)
    827 
    828     def _send_action_metrics(self, host, ra):
    829         """Send repair action metrics to monarch
    830 
    831         @param host     The target to be repaired.
    832         @param ra       an RepairAction instance.
    833         """
    834         fields = {
    835             'tag': ra.tag,
    836             'status': ra.status,
    837             'hostname': _filter_metrics_hostname(host),
    838             'host_class': self.host_class
    839         }
    840         self._actions_counter.increment(fields=fields)
    841 
    842     def verify(self, host, silent=False):
    843         """
    844         Run the verifier DAG on the given host.
    845 
    846         @param host     The target to be verified.
    847         @param silent   If true, don't log host status records.
    848         """
    849         self._verify_root._reverify()
    850         self._verify_root._verify_host(host, silent)
    851 
    852     def repair(self, host, silent=False):
    853         """
    854         Run the repair list on the given host.
    855 
    856         @param host     The target to be repaired.
    857         @param silent   If true, don't log host status records.
    858         """
    859         self._verify_root._reverify()
    860         attempted = False
    861         for ra in self._repair_actions:
    862             try:
    863                 ra._repair_host(host, silent)
    864             except Exception as e:
    865                 # all logging and exception handling was done at
    866                 # lower levels
    867                 pass
    868             finally:
    869                 self._send_action_metrics(host, ra)
    870                 if ra.status not in ('skipped', 'blocked'):
    871                     attempted = True
    872 
    873         result = 'failure'
    874         try:
    875             self._verify_root._verify_host(host, silent)
    876             result = 'success' if attempted else 'not_attempted'
    877         except:
    878             if not attempted:
    879                 result = 'attempt_blocked'
    880             raise
    881         finally:
    882             self._send_strategy_metrics(host, result)
    883 
    884 
    885 def _filter_metrics_hostname(host):
    886     """
    887        Restrict format of hostnames we'll send to monarch
    888 
    889        @param host    An host instance(i.e. ServoHost, CrosHost)
    890     """
    891     if re.match(_HOSTNAME_PATTERN, host.hostname):
    892         return host.hostname
    893     else:
    894         return _DISALLOWED_HOSTNAME
    895 
    896