      1 #!/usr/bin/env python
      2 # Copyright 2015 The Chromium OS Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """Create e-mail reports of the Lab's DUT inventory.
      7 
      8 Gathers a list of all DUTs of interest in the Lab, segregated by
      9 model and pool, and determines whether each DUT is working or
      10 broken.  Then sends one or more e-mail reports summarizing the
     11 status to e-mail addresses provided on the command line.
     12 
     13 usage:  lab_inventory.py [ options ] [ model ... ]
     14 
     15 Options:
     16 --duration / -d <hours>
     17     How far back in time to search job history to determine DUT
     18     status.
     19 
     20 --model-notify <address>[,<address>]
     21     Send the "model status" e-mail to all the specified e-mail
     22     addresses.
     23 
     24 --pool-notify <address>[,<address>]
     25     Send the "pool status" e-mail to all the specified e-mail
     26     addresses.
     27 
     28 --recommend <number>
     29     When generating the "model status" e-mail, include a list of
     30     <number> specific DUTs to be recommended for repair.
     31 
     32 --report-untestable
      33     Scan the inventory for DUTs that can't run tests because they're stuck in
     34     repair loops, or because the scheduler can't give them work.
     35 
     36 --logdir <directory>
     37     Log progress and actions in a file under this directory.  Text
     38     of any e-mail sent will also be logged in a timestamped file in
     39     this directory.
     40 
     41 --debug
     42     Suppress all logging, metrics reporting, and sending e-mail.
     43     Instead, write the output that would be generated onto stdout.
     44 
     45 <model> arguments:
     46     With no arguments, gathers the status for all models in the lab.
     47     With one or more named models on the command line, restricts
     48     reporting to just those models.
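
         Example invocation (the address and model name are hypothetical):
             lab_inventory.py --duration 48 --recommend 10 \
                 --model-notify deputy@example.com eve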
     49 """
     50 
     51 
     52 import argparse
     53 import collections
     54 import logging
     55 import logging.handlers
     56 import os
     57 import re
     58 import sys
     59 import time
     60 
     61 import common
     62 from autotest_lib.client.bin import utils
     63 from autotest_lib.client.common_lib import time_utils
     64 from autotest_lib.frontend.afe.json_rpc import proxy
     65 from autotest_lib.server import constants
     66 from autotest_lib.server import site_utils
     67 from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
     68 from autotest_lib.server.hosts import servo_host
     69 from autotest_lib.server.lib import status_history
     70 from autotest_lib.site_utils import gmail_lib
     71 from chromite.lib import metrics
     72 
     73 
     74 CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
     75 SPARE_POOL = constants.Pools.SPARE_POOL
     76 MANAGED_POOLS = constants.Pools.MANAGED_POOLS
     77 
     78 # _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
     79 #     monitoring by this script.  Currently, we're excluding these:
     80 #   + 'adb' - We're not ready to monitor Android or Brillo hosts.
     81 #   + 'board:guado_moblab' - These are maintained by a separate
     82 #     process that doesn't use this script.
     83 #   + 'board:veyron_rialto' due to crbug.com/854404
     84 
     85 _EXCLUDED_LABELS = {'adb', 'board:guado_moblab',
     86                     'board:veyron_rialto'}
     87 
     88 # _DEFAULT_DURATION:
     89 #     Default value used for the --duration command line option.
     90 #     Specifies how far back in time to search in order to determine
     91 #     DUT status.
     92 
     93 _DEFAULT_DURATION = 24
     94 
     95 # _LOGDIR:
     96 #     Relative path used in the calculation of the default setting for
     97 #     the --logdir option.  The full path is relative to the root of the
     98 #     autotest directory, as determined from sys.argv[0].
     99 # _LOGFILE:
    100 #     Basename of a file to which general log information will be
    101 #     written.
    102 # _LOG_FORMAT:
    103 #     Format string for log messages.
    104 
    105 _LOGDIR = os.path.join('logs', 'dut-data')
    106 _LOGFILE = 'lab-inventory.log'
    107 _LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'
    108 
    109 # Pattern describing location-based host names in the Chrome OS test
    110 # labs.  Each DUT hostname designates the DUT's location:
    111 #   * A lab (room) that's physically separated from other labs
    112 #     (i.e. there's a door).
    113 #   * A row (or aisle) of DUTs within the lab.
    114 #   * A vertical rack of shelves on the row.
    115 #   * A specific host on one shelf of the rack.
    116 
    117 _HOSTNAME_PATTERN = re.compile(
    118         r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
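         # For example, a (hypothetical) hostname 'chromeos2-row3-rack4-host5'
         # captures lab 'chromeos2', row 3, rack 4, host 5.  Hostnames that
         # don't match this pattern are skipped by _sort_by_location() and by
         # the untestable-DUT scan.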
    119 
    120 # _REPAIR_LOOP_THRESHOLD:
    121 #    The number of repeated Repair tasks that must be seen to declare
    122 #    that a DUT is stuck in a repair loop.
    123 
    124 _REPAIR_LOOP_THRESHOLD = 4
    125 
    126 
    127 _METRICS_PREFIX = 'chromeos/autotest/inventory'
    128 _UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    129     _METRICS_PREFIX + '/untestable',
    130     'DUTs that cannot be scheduled for testing')
    131 
    132 _MISSING_DUT_METRIC = metrics.Counter(
    133     _METRICS_PREFIX + '/missing', 'DUTs which cannot be found by lookup queries'
    134     ' because they are invalid or deleted')
    135 
    136 # _Diagnosis - namedtuple corresponding to the return value from
    137 # `HostHistory.last_diagnosis()`
    138 _Diagnosis = collections.namedtuple('_Diagnosis', ['status', 'task'])
    139 
    140 def _get_diagnosis(history):
    141     dut_present = True
    142     try:
    143         diagnosis = _Diagnosis(*history.last_diagnosis())
    144         if (diagnosis.status == status_history.BROKEN
    145                 and diagnosis.task.end_time < history.start_time):
    146             return _Diagnosis(status_history.UNUSED, diagnosis.task)
    147         else:
    148             return diagnosis
    149     except proxy.JSONRPCException as e:
     150         logging.warning(e)
    151         dut_present = False
    152     finally:
    153         _MISSING_DUT_METRIC.increment(
    154             fields={'host': history.hostname, 'presence': dut_present})
    155     return _Diagnosis(None, None)
    156 
    157 
    158 def _host_is_working(history):
    159     return _get_diagnosis(history).status == status_history.WORKING
    160 
    161 
    162 def _host_is_broken(history):
    163     return _get_diagnosis(history).status == status_history.BROKEN
    164 
    165 
    166 def _host_is_idle(history):
    167     idle_statuses = {status_history.UNUSED, status_history.UNKNOWN}
    168     return _get_diagnosis(history).status in idle_statuses
    169 
    170 
    171 class _HostSetInventory(object):
    172     """Maintains a set of related `HostJobHistory` objects.
    173 
    174     Current usage of this class is that all DUTs are part of a single
    175     scheduling pool of DUTs for a single model; however, this class make
    176     no assumptions about the actual relationship among the DUTs.
    177 
    178     The collection is segregated into disjoint categories of "working",
    179     "broken", and "idle" DUTs.  Accessor methods allow finding both the
    180     list of DUTs in each category, as well as counts of each category.
    181 
    182     Performance note:  Certain methods in this class are potentially
    183     expensive:
    184       * `get_working()`
    185       * `get_working_list()`
    186       * `get_broken()`
    187       * `get_broken_list()`
    188       * `get_idle()`
    189       * `get_idle_list()`
    190     The first time any one of these methods is called, it causes
    191     multiple RPC calls with a relatively expensive set of database
    192     queries.  However, the results of the queries are cached in the
    193     individual `HostJobHistory` objects, so only the first call
    194     actually pays the full cost.
    195 
    196     Additionally, `get_working_list()`, `get_broken_list()` and
    197     `get_idle_list()` cache their return values to avoid recalculating
    198     lists at every call; this caching is separate from the caching of
    199     RPC results described above.
    200 
    201     This class is deliberately constructed to delay the RPC cost until
    202     the accessor methods are called (rather than to query in
    203     `record_host()`) so that it's possible to construct a complete
    204     `_LabInventory` without making the expensive queries at creation
    205     time.  `_populate_model_counts()`, below, assumes this behavior.
    206     """
    207 
    208     def __init__(self):
    209         self._histories = []
    210         self._working_list = None
    211         self._broken_list = None
    212         self._idle_list = None
    213 
    214     def record_host(self, host_history):
    215         """Add one `HostJobHistory` object to the collection.
    216 
    217         @param host_history The `HostJobHistory` object to be
    218                             remembered.
    219         """
    220         self._working_list = None
    221         self._broken_list = None
    222         self._idle_list = None
    223         self._histories.append(host_history)
    224 
    225     def get_working_list(self):
    226         """Return a list of all working DUTs in the pool.
    227 
    228         Filter `self._histories` for histories where the DUT is
    229         diagnosed as working.
    230 
     231         Cache the result so that we only calculate it once.
    232 
    233         @return A list of HostJobHistory objects.
    234         """
    235         if self._working_list is None:
    236             self._working_list = [h for h in self._histories
    237                                   if _host_is_working(h)]
    238         return self._working_list
    239 
    240     def get_working(self):
    241         """Return the number of working DUTs in the pool."""
    242         return len(self.get_working_list())
    243 
    244     def get_broken_list(self):
    245         """Return a list of all broken DUTs in the pool.
    246 
    247         Filter `self._histories` for histories where the DUT is
    248         diagnosed as broken.
    249 
     250         Cache the result so that we only calculate it once.
    251 
    252         @return A list of HostJobHistory objects.
    253         """
    254         if self._broken_list is None:
    255             self._broken_list = [h for h in self._histories
    256                                  if _host_is_broken(h)]
    257         return self._broken_list
    258 
    259     def get_broken(self):
    260         """Return the number of broken DUTs in the pool."""
    261         return len(self.get_broken_list())
    262 
    263     def get_idle_list(self):
    264         """Return a list of all idle DUTs in the pool.
    265 
    266         Filter `self._histories` for histories where the DUT is
    267         diagnosed as idle.
    268 
     269         Cache the result so that we only calculate it once.
    270 
    271         @return A list of HostJobHistory objects.
    272         """
    273         if self._idle_list is None:
    274             self._idle_list = [h for h in self._histories
    275                                if _host_is_idle(h)]
    276         return self._idle_list
    277 
    278     def get_idle(self):
    279         """Return the number of idle DUTs in the pool."""
    280         return len(self.get_idle_list())
    281 
    282     def get_total(self):
    283         """Return the total number of DUTs in the pool."""
    284         return len(self._histories)
    285 
    286     def get_all_histories(self):
    287         return self._histories
    288 
    289 
    290 class _PoolSetInventory(object):
    291     """Maintains a set of `HostJobHistory`s for a set of pools.
    292 
    293     The collection is segregated into disjoint categories of "working",
    294     "broken", and "idle" DUTs.  Accessor methods allow finding both the
    295     list of DUTs in each category, as well as counts of each category.
    296     Accessor queries can be for an individual pool, or against all
    297     pools.
    298 
    299     Performance note:  This class relies on `_HostSetInventory`.  Public
     300     methods in this class generally delegate to methods of the same name
     301     in the underlying class, and so share that class's performance
     302     characteristics.
    303     """
    304 
    305     def __init__(self, pools):
    306         self._histories_by_pool = {
    307             pool: _HostSetInventory() for pool in pools
    308         }
    309 
    310     def record_host(self, host_history):
    311         """Add one `HostJobHistory` object to the collection.
    312 
    313         @param host_history The `HostJobHistory` object to be
    314                             remembered.
    315         """
    316         pool = host_history.host_pool
    317         self._histories_by_pool[pool].record_host(host_history)
    318 
    319     def _count_pool(self, get_pool_count, pool=None):
    320         """Internal helper to count hosts in a given pool.
    321 
    322         The `get_pool_count` parameter is a function to calculate
    323         the exact count of interest for the pool.
    324 
    325         @param get_pool_count  Function to return a count from a
     326                                _HostSetInventory object.
    327         @param pool            The pool to be counted.  If `None`,
    328                                return the total across all pools.
    329         """
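                 # Callers pass an unbound method such as
                 # `_HostSetInventory.get_working`; invoking it with a cached
                 # `_HostSetInventory` instance produces that pool's count.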
    330         if pool is None:
    331             return sum([get_pool_count(cached_history) for cached_history in
    332                         self._histories_by_pool.values()])
    333         else:
    334             return get_pool_count(self._histories_by_pool[pool])
    335 
    336     def get_working_list(self):
    337         """Return a list of all working DUTs (across all pools).
    338 
    339         Go through all HostJobHistory objects across all pools,
    340         selecting all DUTs identified as working.
    341 
    342         @return A list of HostJobHistory objects.
    343         """
    344         l = []
    345         for p in self._histories_by_pool.values():
    346             l.extend(p.get_working_list())
    347         return l
    348 
    349     def get_working(self, pool=None):
    350         """Return the number of working DUTs in a pool.
    351 
    352         @param pool  The pool to be counted.  If `None`, return the
    353                      total across all pools.
    354 
    355         @return The total number of working DUTs in the selected
    356                 pool(s).
    357         """
    358         return self._count_pool(_HostSetInventory.get_working, pool)
    359 
    360     def get_broken_list(self):
    361         """Return a list of all broken DUTs (across all pools).
    362 
    363         Go through all HostJobHistory objects across all pools,
    364         selecting all DUTs identified as broken.
    365 
    366         @return A list of HostJobHistory objects.
    367         """
    368         l = []
    369         for p in self._histories_by_pool.values():
    370             l.extend(p.get_broken_list())
    371         return l
    372 
    373     def get_broken(self, pool=None):
    374         """Return the number of broken DUTs in a pool.
    375 
    376         @param pool  The pool to be counted.  If `None`, return the
    377                      total across all pools.
    378 
    379         @return The total number of broken DUTs in the selected pool(s).
    380         """
    381         return self._count_pool(_HostSetInventory.get_broken, pool)
    382 
    383     def get_idle_list(self, pool=None):
    384         """Return a list of all idle DUTs in the given pool.
    385 
    386         Go through all HostJobHistory objects across all pools,
    387         selecting all DUTs identified as idle.
    388 
    389         @param pool: The pool to be counted. If `None`, return the total list
    390                      across all pools.
    391 
    392         @return A list of HostJobHistory objects.
    393         """
    394         if pool is None:
    395             l = []
    396             for p in self._histories_by_pool.itervalues():
    397                 l.extend(p.get_idle_list())
    398             return l
    399         else:
    400             return self._histories_by_pool[pool].get_idle_list()
    401 
    402     def get_idle(self, pool=None):
    403         """Return the number of idle DUTs in a pool.
    404 
    405         @param pool: The pool to be counted. If `None`, return the total
    406                      across all pools.
    407 
    408         @return The total number of idle DUTs in the selected pool(s).
    409         """
    410         return self._count_pool(_HostSetInventory.get_idle, pool)
    411 
    412     def get_spares_buffer(self, spare_pool=SPARE_POOL):
     413         """Return the nominal number of working spares.
    414 
    415         Calculates and returns how many working spares there would
    416         be in the spares pool if all broken DUTs were in the spares
    417         pool.  This number may be negative, indicating a shortfall
    418         in the critical pools.
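
                 For example (hypothetical counts): 8 DUTs in the spare pool
                 and 3 broken DUTs across all managed pools give a buffer of 5.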
    419 
     420         @return The total number of DUTs in the spares pool, less the total
    421                 number of broken DUTs in all pools.
    422         """
    423         return self.get_total(spare_pool) - self.get_broken()
    424 
    425     def get_total(self, pool=None):
    426         """Return the total number of DUTs in a pool.
    427 
    428         @param pool  The pool to be counted.  If `None`, return the
    429                      total across all pools.
    430 
    431         @return The total number of DUTs in the selected pool(s).
    432         """
    433         return self._count_pool(_HostSetInventory.get_total, pool)
    434 
    435     def get_all_histories(self, pool=None):
    436         if pool is None:
    437             for p in self._histories_by_pool.itervalues():
    438                 for h in p.get_all_histories():
    439                     yield h
    440         else:
    441             for h in self._histories_by_pool[pool].get_all_histories():
    442                 yield h
    443 
    444 
    445 def _is_migrated_to_skylab(afehost):
    446     """Return True if the provided frontend.Host has been migrated to skylab."""
    447     return afehost.hostname.endswith('-migrated-do-not-use')
    448 
    449 
    450 def _eligible_host(afehost):
    451     """Return whether this host is eligible for monitoring.
    452 
    453     @param afehost  The host to be tested for eligibility.
    454     """
    455     if _is_migrated_to_skylab(afehost):
    456         return False
    457 
    458     # DUTs without an existing, unique 'model' or 'pool' label aren't meant to
    459     # exist in the managed inventory; their presence generally indicates an
    460     # error in the database. The _LabInventory constructor requires hosts to
    461     # conform to the label restrictions. Failing an inventory run for a single
    462     # bad entry is wrong, so we ignore these hosts.
    463     models = [l for l in afehost.labels
    464                  if l.startswith(constants.Labels.MODEL_PREFIX)]
    465     pools = [l for l in afehost.labels
    466                  if l.startswith(constants.Labels.POOL_PREFIX)]
    467     excluded = _EXCLUDED_LABELS.intersection(afehost.labels)
    468     return len(models) == 1 and len(pools) == 1 and not excluded
    469 
    470 
    471 class _LabInventory(collections.Mapping):
    472     """Collection of `HostJobHistory` objects for the Lab's inventory.
    473 
    474     This is a dict-like collection indexed by model.  Indexing returns
    475     the _PoolSetInventory object associated with the model.
    476     """
    477 
    478     @classmethod
    479     def create_inventory(cls, afe, start_time, end_time, modellist=[]):
    480         """Return a Lab inventory with specified parameters.
    481 
    482         By default, gathers inventory from `HostJobHistory` objects for
    483         all DUTs in the `MANAGED_POOLS` list.  If `modellist` is
    484         supplied, the inventory will be restricted to only the given
    485         models.
    486 
    487         @param afe          AFE object for constructing the
    488                             `HostJobHistory` objects.
    489         @param start_time   Start time for the `HostJobHistory` objects.
    490         @param end_time     End time for the `HostJobHistory` objects.
    491         @param modellist    List of models to include.  If empty,
    492                             include all available models.
    493         @return A `_LabInventory` object for the specified models.
    494         """
    495         target_pools = MANAGED_POOLS
    496         label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
    497         afehosts = afe.get_hosts(labels__name__in=label_list)
    498         if modellist:
    499             # We're deliberately not checking host eligibility in this
    500             # code path.  This is a debug path, not used in production;
    501             # it may be useful to include ineligible hosts here.
    502             modelhosts = []
    503             for model in modellist:
    504                 model_label = constants.Labels.MODEL_PREFIX + model
    505                 host_list = [h for h in afehosts
    506                                   if model_label in h.labels]
    507                 modelhosts.extend(host_list)
    508             afehosts = modelhosts
    509         else:
    510             afehosts = [h for h in afehosts if _eligible_host(h)]
    511         create = lambda host: (
    512                 status_history.HostJobHistory(afe, host,
    513                                               start_time, end_time))
    514         return cls([create(host) for host in afehosts], target_pools)
    515 
    516     def __init__(self, histories, pools):
    517         models = {h.host_model for h in histories}
    518         self._modeldata = {model: _PoolSetInventory(pools) for model in models}
    519         self._dut_count = len(histories)
    520         for h in histories:
    521             self[h.host_model].record_host(h)
    522         self._boards = {h.host_board for h in histories}
    523 
    524     def __getitem__(self, key):
    525         return self._modeldata.__getitem__(key)
    526 
    527     def __len__(self):
    528         return self._modeldata.__len__()
    529 
    530     def __iter__(self):
    531         return self._modeldata.__iter__()
    532 
    533     def get_num_duts(self):
    534         """Return the total number of DUTs in the inventory."""
    535         return self._dut_count
    536 
    537     def get_num_models(self):
    538         """Return the total number of models in the inventory."""
    539         return len(self)
    540 
    541     def get_pool_models(self, pool):
    542         """Return all models in `pool`.
    543 
    544         @param pool The pool to be inventoried for models.
    545         """
    546         return {m for m, h in self.iteritems() if h.get_total(pool)}
    547 
    548     def get_boards(self):
    549         return self._boards
    550 
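         # Typical use of `_LabInventory` (a sketch; the variable names are
         # illustrative):
         #     inventory = _LabInventory.create_inventory(afe, start_time,
         #                                                end_time)
         #     for model, pools in inventory.iteritems():
         #         print model, pools.get_working(), pools.get_broken()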
    551 
    552 def _reportable_models(inventory, spare_pool=SPARE_POOL):
    553     """Iterate over all models subject to reporting.
    554 
    555     Yields the contents of `inventory.iteritems()` filtered to include
    556     only reportable models.  A model is reportable if it has DUTs in
    557     both `spare_pool` and at least one other pool.
    558 
    559     @param spare_pool  The spare pool to be tested for reporting.
    560     """
    561     for model, poolset in inventory.iteritems():
    562         spares = poolset.get_total(spare_pool)
    563         total = poolset.get_total()
    564         if spares != 0 and spares != total:
    565             yield model, poolset
    566 
    567 
    568 def _all_dut_histories(inventory):
    569     for poolset in inventory.itervalues():
    570         for h in poolset.get_all_histories():
    571             yield h
    572 
    573 
    574 def _sort_by_location(inventory_list):
    575     """Return a list of DUTs, organized by location.
    576 
    577     Take the given list of `HostJobHistory` objects, separate it
    578     into a list per lab, and sort each lab's list by location.  The
    579     order of sorting within a lab is
    580       * By row number within the lab,
    581       * then by rack number within the row,
    582       * then by host shelf number within the rack.
    583 
    584     Return a list of the sorted lists.
    585 
    586     Implementation note: host locations are sorted by converting
    587     each location into a base 100 number.  If row, rack or
    588     host numbers exceed the range [0..99], then sorting will
    589     break down.
    590 
    591     @return A list of sorted lists of DUTs.
    592     """
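             # Location key example (hypothetical hostname): for
             # 'chromeos2-row3-rack4-host5' the captured groups (3, 4, 5) give
             # the key 3 * 100 * 100 + 4 * 100 + 5 = 30405, so DUTs sort by
             # row, then rack, then host.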
    593     BASE = 100
    594     lab_lists = {}
    595     for history in inventory_list:
    596         location = _HOSTNAME_PATTERN.match(history.host.hostname)
    597         if location:
    598             lab = location.group(1)
    599             key = 0
    600             for idx in location.group(2, 3, 4):
    601                 key = BASE * key + int(idx)
    602             lab_lists.setdefault(lab, []).append((key, history))
    603     return_list = []
    604     for dut_list in lab_lists.values():
    605         dut_list.sort(key=lambda t: t[0])
    606         return_list.append([t[1] for t in dut_list])
    607     return return_list
    608 
    609 
    610 def _score_repair_set(buffer_counts, repair_list):
    611     """Return a numeric score rating a set of DUTs to be repaired.
    612 
    613     `buffer_counts` is a dictionary mapping model names to the size of
    614     the model's spares buffer.
    615 
    616     `repair_list` is a list of `HostJobHistory` objects for the DUTs to
    617     be repaired.
    618 
    619     This function calculates the new set of buffer counts that would
    620     result from the proposed repairs, and scores the new set using two
    621     numbers:
    622       * Worst case buffer count for any model (higher is better).  This
    623         is the more significant number for comparison.
    624       * Number of models at the worst case (lower is better).  This is
    625         the less significant number.
    626 
    627     Implementation note:  The score could fail to reflect the intended
    628     criteria if there are more than 1000 models in the inventory.
    629 
     630     @param buffer_counts  A dictionary mapping models to buffer counts.
    631     @param repair_list    A list of `HostJobHistory` objects for the
    632                           DUTs to be repaired.
    633     @return A numeric score.
    634     """
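             # Worked example (hypothetical counts): with buffer_counts
             # {'modelA': 0, 'modelB': 2} and a repair set holding one modelA
             # DUT, the new counts are [1, 2]; the worst case is 1 and occurs
             # once, so the score is 1000 * 1 - 1 = 999.  Repairing a modelB
             # DUT instead gives counts [0, 3] and score 1000 * 0 - 1 = -1, so
             # the modelA repair set is preferred.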
    635     # Go through `buffer_counts`, and create a list of new counts
    636     # that records the buffer count for each model after repair.
    637     # The new list of counts discards the model names, as they don't
    638     # contribute to the final score.
    639     _NMODELS = 1000
    640     pools = {h.host_pool for h in repair_list}
    641     repair_inventory = _LabInventory(repair_list, pools)
    642     new_counts = []
    643     for m, c in buffer_counts.iteritems():
    644         if m in repair_inventory:
    645             newcount = repair_inventory[m].get_total()
    646         else:
    647             newcount = 0
    648         new_counts.append(c + newcount)
    649     # Go through the new list of counts.  Find the worst available
    650     # spares count, and count how many times that worst case occurs.
    651     worst_count = new_counts[0]
    652     num_worst = 1
    653     for c in new_counts[1:]:
    654         if c == worst_count:
    655             num_worst += 1
    656         elif c < worst_count:
    657             worst_count = c
    658             num_worst = 1
    659     # Return the calculated score
    660     return _NMODELS * worst_count - num_worst
    661 
    662 
    663 def _generate_repair_recommendation(inventory, num_recommend):
    664     """Return a summary of selected DUTs needing repair.
    665 
    666     Returns a message recommending a list of broken DUTs to be repaired.
    667     The list of DUTs is selected based on these criteria:
    668       * No more than `num_recommend` DUTs will be listed.
    669       * All DUTs must be in the same lab.
    670       * DUTs should be selected for some degree of physical proximity.
    671       * DUTs for models with a low spares buffer are more important than
    672         DUTs with larger buffers.
    673 
    674     The algorithm used will guarantee that at least one DUT from a model
    675     with the lowest spares buffer will be recommended.  If the worst
    676     spares buffer number is shared by more than one model, the algorithm
    677     will tend to prefer repair sets that include more of those models
    678     over sets that cover fewer models.
    679 
    680     @param inventory      `_LabInventory` object from which to generate
    681                           recommendations.
    682     @param num_recommend  Number of DUTs to recommend for repair.
    683     """
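             # Outline of the selection below: broken DUTs are grouped by lab
             # and sorted by location; within each lab a window of
             # `num_recommend` adjacent DUTs slides across the list and is
             # scored with `_score_repair_set()`; the best-scoring window
             # across all labs is recommended.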
    684     logging.debug('Creating DUT repair recommendations')
    685     model_buffer_counts = {}
    686     broken_list = []
    687     for model, counts in _reportable_models(inventory):
    688         logging.debug('Listing failed DUTs for %s', model)
    689         if counts.get_broken() != 0:
    690             model_buffer_counts[model] = counts.get_spares_buffer()
    691             broken_list.extend(counts.get_broken_list())
    692     # N.B. The logic inside this loop may seem complicated, but
    693     # simplification is hard:
    694     #   * Calculating an initial recommendation outside of
    695     #     the loop likely would make things more complicated,
    696     #     not less.
    697     #   * It's necessary to calculate an initial lab slice once per
    698     #     lab _before_ the while loop, in case the number of broken
    699     #     DUTs in a lab is less than `num_recommend`.
    700     recommendation = None
    701     best_score = None
    702     for lab_duts in _sort_by_location(broken_list):
    703         start = 0
    704         end = num_recommend
    705         lab_slice = lab_duts[start : end]
    706         lab_score = _score_repair_set(model_buffer_counts, lab_slice)
    707         while end < len(lab_duts):
    708             start += 1
    709             end += 1
    710             new_slice = lab_duts[start : end]
    711             new_score = _score_repair_set(model_buffer_counts, new_slice)
    712             if new_score > lab_score:
    713                 lab_slice = new_slice
    714                 lab_score = new_score
    715         if recommendation is None or lab_score > best_score:
    716             recommendation = lab_slice
    717             best_score = lab_score
     718     # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    719     # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    720     # know more, go try it yourself...
    721     line_fmt = '%-30s %-16s %-6s\n    %s '
    722     message = ['Repair recommendations:\n',
    723                line_fmt % ( 'Hostname', 'Model', 'Servo?', 'Logs URL')]
    724     if recommendation:
    725         for h in recommendation:
    726             servo_name = servo_host.make_servo_hostname(h.host.hostname)
    727             servo_present = utils.host_is_in_lab_zone(servo_name)
    728             event = _get_diagnosis(h).task
    729             line = line_fmt % (
    730                     h.host.hostname, h.host_model,
    731                     'Yes' if servo_present else 'No', event.job_url)
    732             message.append(line)
    733     else:
    734         message.append('(No DUTs to repair)')
    735     return '\n'.join(message)
    736 
    737 
    738 def _generate_model_inventory_message(inventory):
    739     """Generate the "model inventory" e-mail message.
    740 
    741     The model inventory is a list by model summarizing the number of
    742     working, broken, and idle DUTs, and the total shortfall or surplus
    743     of working devices relative to the minimum critical pool
    744     requirement.
    745 
    746     The report omits models with no DUTs in the spare pool or with no
    747     DUTs in a critical pool.
    748 
     749     N.B. For sample output text formatted as users can expect to
    750     see it in e-mail and log files, refer to the unit tests.
    751 
    752     @param inventory  `_LabInventory` object to be reported on.
    753     @return String with the inventory message to be sent.
    754     """
    755     logging.debug('Creating model inventory')
    756     nworking = 0
    757     nbroken = 0
    758     nidle = 0
    759     nbroken_models = 0
    760     ntotal_models = 0
    761     summaries = []
    762     column_names = (
    763         'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    764     for model, counts in _reportable_models(inventory):
     765         logging.debug('Counting %2d DUTs for model %s',
    766                       counts.get_total(), model)
    767         # Summary elements laid out in the same order as the column
    768         # headers:
    769         #     Model Avail   Bad  Idle  Good  Spare Total
    770         #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
    771         element = (model,
    772                    counts.get_spares_buffer(),
    773                    counts.get_broken(),
    774                    counts.get_idle(),
    775                    counts.get_working(),
    776                    counts.get_total(SPARE_POOL),
    777                    counts.get_total())
    778         if element[2]:
    779             summaries.append(element)
    780             nbroken_models += 1
    781         ntotal_models += 1
    782         nbroken += element[2]
    783         nidle += element[3]
    784         nworking += element[4]
    785     ntotal = nworking + nbroken + nidle
    786     summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    787     broken_percent = int(round(100.0 * nbroken / ntotal))
    788     idle_percent = int(round(100.0 * nidle / ntotal))
    789     working_percent = 100 - broken_percent - idle_percent
    790     message = ['Summary of DUTs in inventory:',
    791                '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
    792                '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
    793                    nbroken, broken_percent,
    794                    nidle, idle_percent,
    795                    nworking, working_percent,
    796                    ntotal),
    797                '',
    798                'Models with failures: %d' % nbroken_models,
    799                'Models in inventory:  %d' % ntotal_models,
    800                '', '',
    801                'Full model inventory:\n',
    802                '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    803     message.extend(
    804             ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    805     return '\n'.join(message)
    806 
    807 
    808 _POOL_INVENTORY_HEADER = '''\
    809 Notice to Infrastructure deputies:  All models shown below are at
     810 less than full strength; please take action to resolve the issues.
     811 Once you're satisfied that failures won't recur, failed DUTs can
    812 be replaced with spares by running `balance_pool`.  Detailed
    813 instructions can be found here:
    814     http://go/cros-manage-duts
    815 '''
    816 
    817 
    818 def _generate_pool_inventory_message(inventory):
    819     """Generate the "pool inventory" e-mail message.
    820 
    821     The pool inventory is a list by pool and model summarizing the
     822     number of working, broken, and idle DUTs in the pool.  Only models
     823     with at least one broken or idle DUT are included in the list.
    824 
     825     N.B. For sample output text formatted as users can expect to see it
    826     in e-mail and log files, refer to the unit tests.
    827 
    828     @param inventory  `_LabInventory` object to be reported on.
    829     @return String with the inventory message to be sent.
    830     """
    831     logging.debug('Creating pool inventory')
    832     message = [_POOL_INVENTORY_HEADER]
    833     newline = ''
    834     for pool in CRITICAL_POOLS:
    835         message.append(
    836             '%sStatus for pool:%s, by model:' % (newline, pool))
    837         message.append(
    838             '%-20s   %5s %5s %5s %5s' % (
    839                 'Model', 'Bad', 'Idle', 'Good', 'Total'))
    840         data_list = []
    841         for model, counts in inventory.iteritems():
    842             logging.debug('Counting %2d DUTs for %s, %s',
    843                           counts.get_total(pool), model, pool)
    844             broken = counts.get_broken(pool)
    845             idle = counts.get_idle(pool)
    846             # models at full strength are not reported
    847             if not broken and not idle:
    848                 continue
    849             working = counts.get_working(pool)
    850             total = counts.get_total(pool)
    851             data_list.append((model, broken, idle, working, total))
    852         if data_list:
    853             data_list = sorted(data_list, key=lambda d: -d[1])
    854             message.extend(
    855                 ['%-20s   %5d %5d %5d %5d' % t for t in data_list])
    856         else:
    857             message.append('(All models at full strength)')
    858         newline = '\n'
    859     return '\n'.join(message)
    860 
    861 
    862 _IDLE_INVENTORY_HEADER = '''\
    863 Notice to Infrastructure deputies:  The hosts shown below haven't
    864 run any jobs for at least 24 hours. Please check each host; locked
    865 hosts should normally be unlocked; stuck jobs should normally be
    866 aborted.
    867 '''
    868 
    869 
    870 def _generate_idle_inventory_message(inventory):
    871     """Generate the "idle inventory" e-mail message.
    872 
    873     The idle inventory is a host list with corresponding pool and model,
    874     where the hosts are identified as idle.
    875 
     876     N.B. For sample output text formatted as users can expect to
    877     see it in e-mail and log files, refer to the unit tests.
    878 
    879     @param inventory  `_LabInventory` object to be reported on.
    880     @return String with the inventory message to be sent.
    881     """
    882     logging.debug('Creating idle inventory')
    883     message = [_IDLE_INVENTORY_HEADER]
    884     message.append('Idle Host List:')
    885     message.append('%-30s %-20s %s' % ('Hostname', 'Model', 'Pool'))
    886     data_list = []
    887     for pool in MANAGED_POOLS:
    888         for model, counts in inventory.iteritems():
    889             logging.debug('Counting %2d DUTs for %s, %s',
    890                           counts.get_total(pool), model, pool)
    891             data_list.extend([(dut.host.hostname, model, pool)
    892                                   for dut in counts.get_idle_list(pool)])
    893     if data_list:
    894         message.extend(['%-30s %-20s %s' % t for t in data_list])
    895     else:
    896         message.append('(No idle DUTs)')
    897     return '\n'.join(message)
    898 
    899 
    900 def _send_email(arguments, tag, subject, recipients, body):
    901     """Send an inventory e-mail message.
    902 
    903     The message is logged in the selected log directory using `tag` for
    904     the file name.
    905 
    906     If the --debug option was requested, the message is neither logged
    907     nor sent, but merely printed on stdout.
    908 
    909     @param arguments   Parsed command-line options.
    910     @param tag         Tag identifying the inventory for logging
    911                        purposes.
    912     @param subject     E-mail Subject: header line.
    913     @param recipients  E-mail addresses for the To: header line.
    914     @param body        E-mail message body.
    915     """
    916     logging.debug('Generating email: "%s"', subject)
    917     all_recipients = ', '.join(recipients)
    918     report_body = '\n'.join([
    919             'To: %s' % all_recipients,
    920             'Subject: %s' % subject,
    921             '', body, ''])
    922     if arguments.debug:
    923         print report_body
    924     else:
    925         filename = os.path.join(arguments.logdir, tag)
    926         try:
     927             with open(filename, 'w') as report_file:
     928                 report_file.write(report_body)
     929                 # The file is closed automatically on leaving the block.
    930         except EnvironmentError as e:
    931             logging.error('Failed to write %s:  %s', filename, e)
    932         try:
    933             gmail_lib.send_email(all_recipients, subject, body)
    934         except Exception as e:
    935             logging.error('Failed to send e-mail to %s:  %s',
    936                           all_recipients, e)
    937 
    938 
    939 def _populate_model_counts(inventory):
    940     """Gather model counts while providing interactive feedback.
    941 
    942     Gathering the status of all individual DUTs in the lab can take
    943     considerable time (~30 minutes at the time of this writing).
    944     Normally, we pay that cost by querying as we go.  However, with
    945     the `--debug` option, we expect a human being to be watching the
    946     progress in real time.  So, we force the first (expensive) queries
    947     to happen up front, and provide simple ASCII output on sys.stdout
    948     to show a progress bar and results.
    949 
    950     @param inventory  `_LabInventory` object from which to gather
    951                       counts.
    952     """
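             # Progress output example: with 25 models the loop below prints
             # '....+....1....+....2....+' before the final
             # 'Found N broken DUTs' summary.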
    953     n = 0
    954     total_broken = 0
    955     for counts in inventory.itervalues():
    956         n += 1
    957         if n % 10 == 5:
    958             c = '+'
    959         elif n % 10 == 0:
    960             c = '%d' % ((n / 10) % 10)
    961         else:
    962             c = '.'
    963         sys.stdout.write(c)
    964         sys.stdout.flush()
    965         # This next call is where all the time goes - it forces all of a
    966         # model's `HostJobHistory` objects to query the database and
    967         # cache their results.
    968         total_broken += counts.get_broken()
    969     sys.stdout.write('\n')
    970     sys.stdout.write('Found %d broken DUTs\n' % total_broken)
    971 
    972 
    973 def _perform_model_inventory(arguments, inventory, timestamp):
    974     """Perform the model inventory report.
    975 
    976     The model inventory report consists of the following:
    977       * A list of DUTs that are recommended to be repaired.  This list
    978         is optional, and only appears if the `--recommend` option is
    979         present.
    980       * A list of all models that have failed DUTs, with counts
    981         of working, broken, and spare DUTs, among others.
    982 
    983     @param arguments  Command-line arguments as returned by
    984                       `ArgumentParser`
    985     @param inventory  `_LabInventory` object to be reported on.
    986     @param timestamp  A string used to identify this run's timestamp
    987                       in logs and email output.
    988     """
    989     if arguments.recommend:
    990         recommend_message = _generate_repair_recommendation(
    991                 inventory, arguments.recommend) + '\n\n\n'
    992     else:
    993         recommend_message = ''
    994     model_message = _generate_model_inventory_message(inventory)
    995     _send_email(arguments,
    996                 'models-%s.txt' % timestamp,
    997                 'DUT model inventory %s' % timestamp,
    998                 arguments.model_notify,
    999                 recommend_message + model_message)
   1000 
   1001 
   1002 def _perform_pool_inventory(arguments, inventory, timestamp):
   1003     """Perform the pool inventory report.
   1004 
   1005     The pool inventory report consists of the following:
   1006       * A list of all critical pools that have failed DUTs, with counts
   1007         of working, broken, and idle DUTs.
   1008       * A list of all idle DUTs by hostname including the model and
   1009         pool.
   1010 
   1011     @param arguments  Command-line arguments as returned by
   1012                       `ArgumentParser`
   1013     @param inventory  `_LabInventory` object to be reported on.
   1014     @param timestamp  A string used to identify this run's timestamp in
   1015                       logs and email output.
   1016     """
   1017     pool_message = _generate_pool_inventory_message(inventory)
   1018     idle_message = _generate_idle_inventory_message(inventory)
   1019     _send_email(arguments,
   1020                 'pools-%s.txt' % timestamp,
   1021                 'DUT pool inventory %s' % timestamp,
   1022                 arguments.pool_notify,
   1023                 pool_message + '\n\n\n' + idle_message)
   1024 
   1025 
   1026 def _dut_in_repair_loop(history):
   1027     """Return whether a DUT's history indicates a repair loop.
   1028 
   1029     A DUT is considered looping if it runs no tests, and no tasks pass
   1030     other than repair tasks.
   1031 
   1032     @param history  An instance of `status_history.HostJobHistory` to be
   1033                     scanned for a repair loop.  The caller guarantees
   1034                     that this history corresponds to a working DUT.
    1035     @returns  A true value if the DUT's most recent history
   1036               indicates a repair loop.
   1037     """
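             # Example (hypothetical history): four consecutive successful
             # Repair tasks with no tests and no other passing tasks in the
             # window reach _REPAIR_LOOP_THRESHOLD and return True; any test,
             # any task diagnosed as broken, or any passing non-repair task
             # causes a False return.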
   1038     # Our caller passes only histories for working DUTs; that means
   1039     # we've already paid the cost of fetching the diagnosis task, and
   1040     # we know that the task was successful.  The diagnosis task will be
   1041     # one of the tasks we must scan to find a loop, so if the task isn't
   1042     # a repair task, then our history includes a successful non-repair
   1043     # task, and we're not looping.
   1044     #
   1045     # The for loop below is very expensive, because it must fetch the
   1046     # full history, regardless of how many tasks we examine.  At the
   1047     # time of this writing, this check against the diagnosis task
   1048     # reduces the cost of finding loops in the full inventory from hours
   1049     # to minutes.
   1050     if _get_diagnosis(history).task.name != 'Repair':
   1051         return False
   1052     repair_ok_count = 0
   1053     for task in history:
   1054         if not task.is_special:
   1055             # This is a test, so we're not looping.
   1056             return False
   1057         if task.diagnosis == status_history.BROKEN:
   1058             # Failed a repair, so we're not looping.
   1059             return False
   1060         if (task.diagnosis == status_history.WORKING
   1061                 and task.name != 'Repair'):
   1062             # Non-repair task succeeded, so we're not looping.
   1063             return False
   1064         # At this point, we have either a failed non-repair task, or
   1065         # a successful repair.
   1066         if task.name == 'Repair':
   1067             repair_ok_count += 1
   1068             if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
   1069                 return True
   1070 
   1071 
   1072 def _report_untestable_dut(history, state):
   1073     fields = {
   1074         'dut_hostname': history.hostname,
   1075         'model': history.host_model,
   1076         'pool': history.host_pool,
   1077         'state': state,
   1078     }
   1079     logging.info('DUT in state %(state)s: %(dut_hostname)s, '
   1080                  'model: %(model)s, pool: %(pool)s', fields)
   1081     _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)
   1082 
   1083 
   1084 def _report_untestable_dut_metrics(inventory):
   1085     """Scan the inventory for DUTs unable to run tests.
   1086 
   1087     DUTs in the inventory are judged "untestable" if they meet one of
   1088     two criteria:
   1089       * The DUT is stuck in a repair loop; that is, it regularly passes
   1090         repair, but never passes other operations.
   1091       * The DUT runs no tasks at all, but is not locked.
   1092 
   1093     This routine walks through the given inventory looking for DUTs in
   1094     either of these states.  Results are reported via a Monarch presence
   1095     metric.
   1096 
   1097     Note:  To make sure that DUTs aren't flagged as "idle" merely
   1098     because there's no work, a separate job runs prior to regular
   1099     inventory runs which schedules trivial work on any DUT that appears
   1100     idle.
   1101 
   1102     @param inventory  `_LabInventory` object to be reported on.
   1103     """
   1104     logging.info('Scanning for untestable DUTs.')
   1105     for history in _all_dut_histories(inventory):
   1106         # Managed DUTs with names that don't match
   1107         # _HOSTNAME_PATTERN shouldn't be possible.  However, we
   1108         # don't want arbitrary strings being attached to the
   1109         # 'dut_hostname' field, so for safety, we exclude all
   1110         # anomalies.
   1111         if not _HOSTNAME_PATTERN.match(history.hostname):
   1112             continue
   1113         if _host_is_working(history):
   1114             if _dut_in_repair_loop(history):
   1115                 _report_untestable_dut(history, 'repair_loop')
   1116         elif _host_is_idle(history):
   1117             if not history.host.locked:
   1118                 _report_untestable_dut(history, 'idle_unlocked')
   1119 
   1120 
   1121 def _log_startup(arguments, startup_time):
   1122     """Log the start of this inventory run.
   1123 
   1124     Print various log messages indicating the start of the run.  Return
   1125     a string based on `startup_time` that will be used to identify this
   1126     run in log files and e-mail messages.
   1127 
   1128     @param startup_time   A UNIX timestamp marking the moment when
   1129                           this inventory run began.
   1130     @returns  A timestamp string that will be used to identify this run
   1131               in logs and email output.
   1132     """
   1133     timestamp = time.strftime('%Y-%m-%d.%H',
   1134                               time.localtime(startup_time))
   1135     logging.debug('Starting lab inventory for %s', timestamp)
   1136     if arguments.model_notify:
   1137         if arguments.recommend:
   1138             logging.debug('Will include repair recommendations')
   1139         logging.debug('Will include model inventory')
   1140     if arguments.pool_notify:
   1141         logging.debug('Will include pool inventory')
   1142     return timestamp
   1143 
   1144 
   1145 def _create_inventory(arguments, end_time):
   1146     """Create the `_LabInventory` instance to use for reporting.
   1147 
   1148     @param end_time   A UNIX timestamp for the end of the time range
   1149                       to be searched in this inventory run.
   1150     """
   1151     start_time = end_time - arguments.duration * 60 * 60
   1152     afe = frontend_wrappers.RetryingAFE(server=None)
   1153     inventory = _LabInventory.create_inventory(
   1154             afe, start_time, end_time, arguments.modelnames)
   1155     logging.info('Found %d hosts across %d models',
   1156                      inventory.get_num_duts(),
   1157                      inventory.get_num_models())
   1158     return inventory
   1159 
   1160 
   1161 def _perform_inventory_reports(arguments):
   1162     """Perform all inventory checks requested on the command line.
   1163 
   1164     Create the initial inventory and run through the inventory reports
   1165     as called for by the parsed command-line arguments.
   1166 
   1167     @param arguments  Command-line arguments as returned by
   1168                       `ArgumentParser`.
   1169     """
   1170     startup_time = time.time()
   1171     timestamp = _log_startup(arguments, startup_time)
   1172     inventory = _create_inventory(arguments, startup_time)
   1173     if arguments.debug:
   1174         _populate_model_counts(inventory)
   1175     if arguments.model_notify:
   1176         _perform_model_inventory(arguments, inventory, timestamp)
   1177     if arguments.pool_notify:
   1178         _perform_pool_inventory(arguments, inventory, timestamp)
   1179     if arguments.report_untestable:
   1180         _report_untestable_dut_metrics(inventory)
   1181 
   1182 
   1183 def _separate_email_addresses(address_list):
   1184     """Parse a list of comma-separated lists of e-mail addresses.
   1185 
   1186     @param address_list  A list of strings containing comma
    1187                          separated e-mail addresses.
   1188     @return A list of the individual e-mail addresses.
   1189     """
   1190     newlist = []
   1191     for arg in address_list:
   1192         newlist.extend([email.strip() for email in arg.split(',')])
   1193     return newlist
   1194 
   1195 
   1196 def _verify_arguments(arguments):
   1197     """Validate command-line arguments.
   1198 
    1199     Split the comma-separated e-mail addresses given for `--model-notify`
    1200     and `--pool-notify` into flat lists of individual addresses.
   1201 
   1202     For non-debug uses, require that at least one inventory report be
   1203     requested.  For debug, if a report isn't specified, treat it as "run
   1204     all the reports."
   1205 
   1206     The return value indicates success or failure; in the case of
   1207     failure, we also write an error message to stderr.
   1208 
   1209     @param arguments  Command-line arguments as returned by
   1210                       `ArgumentParser`
   1211     @return True if the arguments are semantically good, or False
   1212             if the arguments don't meet requirements.
   1213     """
   1214     arguments.model_notify = _separate_email_addresses(
   1215             arguments.model_notify)
   1216     arguments.pool_notify = _separate_email_addresses(
   1217             arguments.pool_notify)
   1218     if not any([arguments.model_notify, arguments.pool_notify,
   1219                 arguments.report_untestable]):
   1220         if not arguments.debug:
   1221             sys.stderr.write('Must request at least one report via '
   1222                              '--model-notify, --pool-notify, or '
   1223                              '--report-untestable\n')
   1224             return False
   1225         else:
   1226             # We want to run all the e-mail reports.  An empty notify
   1227             # list will cause a report to be skipped, so make sure the
   1228             # lists are non-empty.
   1229             arguments.model_notify = ['']
   1230             arguments.pool_notify = ['']
   1231     return True
   1232 
   1233 
   1234 def _get_default_logdir(script):
   1235     """Get the default directory for the `--logdir` option.
   1236 
   1237     The default log directory is the `_LOGDIR` subdirectory of the
   1238     parent of the directory containing this script.
   1239 
   1240     @param script  Path to this script file.
   1241     @return A path to a directory.
   1242     """
   1243     basedir = os.path.dirname(os.path.abspath(script))
   1244     basedir = os.path.dirname(basedir)
   1245     return os.path.join(basedir, _LOGDIR)
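        # Illustrative path arithmetic (the install root shown is only an
        # assumption; `_LOGDIR` is defined earlier in this file):
        #
        #     _get_default_logdir('/usr/local/autotest/site_utils/lab_inventory.py')
        #     # -> os.path.join('/usr/local/autotest', _LOGDIR)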
   1246 
   1247 
   1248 def _parse_command(argv):
   1249     """Parse the command line arguments.
   1250 
   1251     Create an argument parser for this command's syntax, parse the
   1252     command line, and return the result of the ArgumentParser
   1253     parse_args() method.
   1254 
   1255     @param argv Standard command line argument vector; argv[0] is
   1256                 assumed to be the command name.
   1257     @return Result returned by ArgumentParser.parse_args().
   1258     """
   1259     parser = argparse.ArgumentParser(
   1260             prog=argv[0],
   1261             description='Gather and report lab inventory statistics')
   1262     parser.add_argument('-d', '--duration', type=int,
   1263                         default=_DEFAULT_DURATION, metavar='HOURS',
   1264                         help='number of hours back to search for status'
   1265                              ' (default: %d)' % _DEFAULT_DURATION)
   1266     parser.add_argument('--model-notify', action='append',
   1267                         default=[], metavar='ADDRESS',
   1268                         help='Generate model inventory message, '
   1269                         'and send it to the given e-mail address(es)')
   1270     parser.add_argument('--pool-notify', action='append',
   1271                         default=[], metavar='ADDRESS',
   1272                         help='Generate pool inventory message, '
   1273                              'and send it to the given address(es)')
   1274     parser.add_argument('-r', '--recommend', type=int, default=None,
   1275                         help=('Specify how many DUTs should be '
   1276                               'recommended for repair (default: no '
   1277                               'recommendation)'))
   1278     parser.add_argument('--report-untestable', action='store_true',
   1279                         help='Check for devices unable to run tests.')
   1280     parser.add_argument('--debug', action='store_true',
   1281                         help='Print e-mail and metrics messages on stdout '
   1282                              'without sending them.')
   1283     parser.add_argument('--no-metrics', action='store_false',
   1284                         dest='use_metrics',
   1285                         help='Suppress generation of Monarch metrics.')
   1286     parser.add_argument('--logdir', default=_get_default_logdir(argv[0]),
   1287                         help='Directory where logs will be written.')
   1288     parser.add_argument('modelnames', nargs='*',
   1289                         metavar='MODEL',
   1290                         help='names of models to report on '
   1291                              '(default: all models)')
   1292     arguments = parser.parse_args(argv[1:])
   1293     if not _verify_arguments(arguments):
   1294         return None
   1295     return arguments
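        # Example invocations, following the option definitions above (the
        # addresses and model names are illustrative only):
        #
        #     lab_inventory.py --model-notify admin@example.com --recommend 10
        #     lab_inventory.py -d 48 --pool-notify a@example.com,b@example.com
        #     lab_inventory.py --debug some-model other-model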
   1296 
   1297 
   1298 def _configure_logging(arguments):
   1299     """Configure the `logging` module for our needs.
   1300 
   1301     How we log depends on whether the `--debug` option was provided on
   1302     the command line.
   1303       * Without the option, we configure the logging to capture all
   1304         potentially relevant events in a log file.  The log file is
   1305         configured to rotate once a week on Friday evening, preserving
   1306         ~3 months worth of history.
   1307         ~3 months' worth of history.
   1308         human-readable output (including the contents of the e-mail
   1309         messages), so we restrict the output to INFO level.
   1310 
   1311     For convenience, when `--debug` is on, the logging format has
   1312     no adornments, so that a call like `logging.info(msg)` simply writes
   1313     `msg` to stdout, plus a trailing newline.
   1314 
   1315     @param arguments  Command-line arguments as returned by
   1316                       `ArgumentParser`
   1317     """
   1318     root_logger = logging.getLogger()
   1319     if arguments.debug:
   1320         root_logger.setLevel(logging.INFO)
   1321         handler = logging.StreamHandler(sys.stdout)
   1322         handler.setFormatter(logging.Formatter())
   1323     else:
   1324         if not os.path.exists(arguments.logdir):
   1325             os.mkdir(arguments.logdir)
   1326         root_logger.setLevel(logging.DEBUG)
   1327         logfile = os.path.join(arguments.logdir, _LOGFILE)
   1328         handler = logging.handlers.TimedRotatingFileHandler(
   1329                 logfile, when='W4', backupCount=13)
   1330         formatter = logging.Formatter(_LOG_FORMAT,
   1331                                       time_utils.TIME_FMT)
   1332         handler.setFormatter(formatter)
   1333     # TODO(jrbarnette) This is gross.  Importing client.bin.utils
   1334     # implicitly imported logging_config, which calls
   1335     # logging.basicConfig() *at module level*.  That gives us an
   1336     # extra logging handler that we don't want.  So, clear out all
   1337     # the handlers here.
            # Iterate over a copy: removeHandler() mutates the handler list,
            # and removing entries while iterating it would skip handlers.
   1338     for h in list(root_logger.handlers):
   1339         root_logger.removeHandler(h)
   1340     root_logger.addHandler(handler)
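        # A minimal sketch of exercising this setup by hand in debug mode
        # (illustrative only; nothing here runs as part of the script):
        #
        #     args = _parse_command(['lab_inventory.py', '--debug'])
        #     _configure_logging(args)
        #     logging.info('sample message')   # written bare to stdout
        #
        # In the non-debug branch, `when='W4'` rolls the log over on Fridays
        # and `backupCount=13` keeps roughly one quarter's worth of files.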
   1341 
   1342 
   1343 def main(argv):
   1344     """Standard main routine.
   1345 
   1346     @param argv  Command line arguments, including `sys.argv[0]`.
   1347     """
   1348     arguments = _parse_command(argv)
   1349     if not arguments:
   1350         sys.exit(1)
   1351     _configure_logging(arguments)
   1352 
   1353     try:
   1354         if arguments.use_metrics:
   1355             if arguments.debug:
   1356                 logging.info('Debug mode: Will not report metrics to monarch.')
   1357                 metrics_file = '/dev/null'
   1358             else:
   1359                 metrics_file = None
   1360             with site_utils.SetupTsMonGlobalState(
   1361                     'lab_inventory', debug_file=metrics_file,
   1362                     auto_flush=False):
   1363                 success = False
   1364                 try:
   1365                     with metrics.SecondsTimer('%s/duration' % _METRICS_PREFIX):
   1366                         _perform_inventory_reports(arguments)
   1367                     success = True
   1368                 finally:
   1369                     metrics.Counter('%s/tick' % _METRICS_PREFIX).increment(
   1370                             fields={'success': success})
   1371                     metrics.Flush()
   1372         else:
   1373             _perform_inventory_reports(arguments)
   1374     except KeyboardInterrupt:
   1375         pass
   1376     except Exception:
   1377         # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
   1378         logging.exception('Error escaped main')
   1379         raise
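        # Illustrative only: a debug run restricted to two hypothetical models,
        # with repair recommendations, printing all output to stdout:
        #
        #     main(['lab_inventory.py', '--debug', '--recommend', '2',
        #           'model-a', 'model-b'])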
   1380 
   1381 
   1382 def get_inventory(afe):
            """Create a `_LabInventory` covering the last 24 hours.

            @param afe  AFE object used to query the lab's hosts.
            @return A `_LabInventory` for the 24 hours ending now.
            """
   1383     end_time = int(time.time())
   1384     start_time = end_time - 24 * 60 * 60
   1385     return _LabInventory.create_inventory(afe, start_time, end_time)
   1386
   1387
   1388 def get_managed_boards(afe):
            """Return the managed boards found in a fresh 24-hour inventory.

            @param afe  AFE object used to query the lab's hosts.
            @return The result of `get_boards()` on the new inventory.
            """
   1389     return get_inventory(afe).get_boards()
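        # Illustrative usage from another module (the AFE construction shown is
        # an assumption about the caller; any compatible AFE handle would do):
        #
        #     afe = frontend_wrappers.RetryingAFE(timeout_min=30, delay_sec=10)
        #     boards = get_managed_boards(afe)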
   1390 
   1391 
   1392 if __name__ == '__main__':
   1393     main(sys.argv)
   1394