Home | History | Annotate | Download | only in site_utils
      1 #!/usr/bin/env python
      2 # Copyright 2015 The Chromium OS Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """Create e-mail reports of the Lab's DUT inventory.
      7 
      8 Gathers a list of all DUTs of interest in the Lab, segregated by
      9 board and pool, and determines whether each DUT is working or
     10 broken.  Then, send one or more e-mail reports summarizing the
     11 status to e-mail addresses provided on the command line.
     12 
     13 usage:  lab_inventory.py [ options ] [ board ... ]
     14 
     15 Options:
     16 --duration / -d <hours>
     17     How far back in time to search job history to determine DUT
     18     status.
     19 
     20 --board-notify <address>[,<address>]
     21     Send the "board status" e-mail to all the specified e-mail
     22     addresses.
     23 
     24 --pool-notify <address>[,<address>]
     25     Send the "pool status" e-mail to all the specified e-mail
     26     addresses.
     27 
     28 --recommend <number>
     29     When generating the "board status" e-mail, include a list of
     30     <number> specific DUTs to be recommended for repair.
     31 
     32 --logdir <directory>
     33     Log progress and actions in a file under this directory.  Text
     34     of any e-mail sent will also be logged in a timestamped file in
     35     this directory.
     36 
     37 --debug
     38     Suppress all logging and sending e-mail.  Instead, write the
     39     output that would be generated onto stdout.
     40 
     41 <board> arguments:
     42     With no arguments, gathers the status for all boards in the lab.
     43     With one or more named boards on the command line, restricts
     44     reporting to just those boards.
     45 
     46 """
     47 
     48 
     49 import argparse
     50 import logging
     51 import logging.handlers
     52 import os
     53 import re
     54 import sys
     55 import time
     56 
     57 import common
     58 from autotest_lib.client.bin import utils
     59 from autotest_lib.client.common_lib import time_utils
     60 from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
     61 from autotest_lib.server.hosts import servo_host
     62 from autotest_lib.server.lib import status_history
     63 from autotest_lib.site_utils import gmail_lib
     64 from autotest_lib.site_utils.suite_scheduler import constants
     65 
     66 
     67 CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
     68 SPARE_POOL = constants.Pools.SPARE_POOL
     69 MANAGED_POOLS = constants.Pools.MANAGED_POOLS
     70 
     71 # _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
     72 #     monitoring by this script.  Currently, we're excluding any
     73 #     'adb' host, because we're not ready to monitor Android or
     74 #     Brillo hosts.
     75 
     76 _EXCLUDED_LABELS = set(['adb'])
     77 
     78 # _DEFAULT_DURATION:
     79 #     Default value used for the --duration command line option.
     80 #     Specifies how far back in time to search in order to determine
     81 #     DUT status.
     82 
     83 _DEFAULT_DURATION = 24
     84 
     85 # _LOGDIR:
     86 #     Relative path used in the calculation of the default setting
     87 #     for the --logdir option.  The full path path is relative to
     88 #     the root of the autotest directory, as determined from
     89 #     sys.argv[0].
     90 # _LOGFILE:
     91 #     Basename of a file to which general log information will be
     92 #     written.
     93 # _LOG_FORMAT:
     94 #     Format string for log messages.
     95 
     96 _LOGDIR = os.path.join('logs', 'dut-data')
     97 _LOGFILE = 'lab-inventory.log'
     98 _LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'
     99 
    100 # Pattern describing location-based host names in the Chrome OS test
    101 # labs.  Each DUT hostname designates the DUT's location:
    102 #   * A lab (room) that's physically separated from other labs
    103 #     (i.e. there's a door).
    104 #   * A row (or aisle) of DUTs within the lab.
    105 #   * A vertical rack of shelves on the row.
    106 #   * A specific host on one shelf of the rack.
    107 
    108 _HOSTNAME_PATTERN = re.compile(
    109         r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
    110 
    111 # Default entry for managed pools.
    112 
    113 _MANAGED_POOL_DEFAULT = 'all_pools'
    114 
    115 
    116 class _PoolCounts(object):
    117     """Maintains a set of `HostJobHistory` objects for a pool.
    118 
    119     The collected history objects are nominally all part of a single
    120     scheduling pool of DUTs.  The collection maintains a list of
    121     working DUTs, a list of broken DUTs, and a list of all DUTs.
    122 
    123     Performance note:  Certain methods in this class are potentially
    124     expensive:
    125       * `get_working()`
    126       * `get_working_list()`
    127       * `get_broken()`
    128       * `get_broken_list()`
    129       * `get_idle()`
    130       * `get_idle_list()`
    131     The first time any one of these methods is called, it causes
    132     multiple RPC calls with a relatively expensive set of database
    133     queries.  However, the results of the queries are cached in the
    134     individual `HostJobHistory` objects, so only the first call
    135     actually pays the full cost.
    136 
    137     Additionally, `get_working_list()`, `get_broken_list()` and
    138     `get_idle_list()` cache their return values to avoid recalculating
    139     lists at every call; this caching is separate from the caching of RPC
    140     results described above.
    141 
    142     This class is deliberately constructed to delay the RPC cost
    143     until the accessor methods are called (rather than to query in
    144     `record_host()`) so that it's possible to construct a complete
    145     `_LabInventory` without making the expensive queries at creation
    146     time.  `_populate_board_counts()`, below, assumes this behavior.
    147 
    148     """
    149 
    150     def __init__(self):
    151         self._histories = []
    152         self._working_list = None
    153         self._broken_list = None
    154         self._idle_list = None
    155 
    156 
    157     def record_host(self, host_history):
    158         """Add one `HostJobHistory` object to the collection.
    159 
    160         @param host_history The `HostJobHistory` object to be
    161                             remembered.
    162 
    163         """
    164         self._working_list = None
    165         self._broken_list = None
    166         self._idle_list = None
    167         self._histories.append(host_history)
    168 
    169 
    170     def get_working_list(self):
    171         """Return a list of all working DUTs in the pool.
    172 
    173         Filter `self._histories` for histories where the last
    174         diagnosis is `WORKING`.
    175 
    176         Cache the result so that we only cacluate it once.
    177 
    178         @return A list of HostJobHistory objects.
    179 
    180         """
    181         if self._working_list is None:
    182             self._working_list = [h for h in self._histories
    183                     if h.last_diagnosis()[0] == status_history.WORKING]
    184         return self._working_list
    185 
    186 
    187     def get_working(self):
    188         """Return the number of working DUTs in the pool."""
    189         return len(self.get_working_list())
    190 
    191 
    192     def get_broken_list(self):
    193         """Return a list of all broken DUTs in the pool.
    194 
    195         Filter `self._histories` for histories where the last
    196         diagnosis is `BROKEN`.
    197 
    198         Cache the result so that we only cacluate it once.
    199 
    200         @return A list of HostJobHistory objects.
    201 
    202         """
    203         if self._broken_list is None:
    204             self._broken_list = [h for h in self._histories
    205                     if h.last_diagnosis()[0] == status_history.BROKEN]
    206         return self._broken_list
    207 
    208 
    209     def get_broken(self):
    210         """Return the number of broken DUTs in the pool."""
    211         return len(self.get_broken_list())
    212 
    213 
    214     def get_idle_list(self):
    215         """Return a list of all idle DUTs in the pool.
    216 
    217         Filter `self._histories` for histories where the last
    218         diagnosis is `UNUSED` or `UNKNOWN`.
    219 
    220         Cache the result so that we only cacluate it once.
    221 
    222         @return A list of HostJobHistory objects.
    223 
    224         """
    225         idle_list = [status_history.UNUSED, status_history.UNKNOWN]
    226         if self._idle_list is None:
    227             self._idle_list = [h for h in self._histories
    228                     if h.last_diagnosis()[0] in idle_list]
    229         return self._idle_list
    230 
    231 
    232     def get_idle(self):
    233         """Return the number of idle DUTs in the pool."""
    234         return len(self.get_idle_list())
    235 
    236 
    237     def get_total(self):
    238         """Return the total number of DUTs in the pool."""
    239         return len(self._histories)
    240 
    241 
    242 class _BoardCounts(object):
    243     """Maintains a set of `HostJobHistory` objects for a board.
    244 
    245     The collected history objects are nominally all of the same
    246     board.  The collection maintains a count of working DUTs, a
    247     count of broken DUTs, and a total count.  The counts can be
    248     obtained either for a single pool, or as a total across all
    249     pools.
    250 
    251     DUTs in the collection must be assigned to one of the pools
    252     in `_MANAGED_POOLS`.
    253 
    254     The `get_working()` and `get_broken()` methods rely on the
    255     methods of the same name in _PoolCounts, so the performance
    256     note in _PoolCounts applies here as well.
    257 
    258     """
    259 
    260     def __init__(self):
    261         self._pools = {
    262             pool: _PoolCounts() for pool in MANAGED_POOLS
    263         }
    264 
    265     def record_host(self, host_history):
    266         """Add one `HostJobHistory` object to the collection.
    267 
    268         @param host_history The `HostJobHistory` object to be
    269                             remembered.
    270 
    271         """
    272         pool = host_history.host_pool
    273         self._pools[pool].record_host(host_history)
    274 
    275 
    276     def _count_pool(self, get_pool_count, pool=None):
    277         """Internal helper to count hosts in a given pool.
    278 
    279         The `get_pool_count` parameter is a function to calculate
    280         the exact count of interest for the pool.
    281 
    282         @param get_pool_count  Function to return a count from a
    283                                _PoolCount object.
    284         @param pool            The pool to be counted.  If `None`,
    285                                return the total across all pools.
    286 
    287         """
    288         if pool is None:
    289             return sum([get_pool_count(counts)
    290                             for counts in self._pools.values()])
    291         else:
    292             return get_pool_count(self._pools[pool])
    293 
    294 
    295     def get_working_list(self):
    296         """Return a list of all working DUTs for the board.
    297 
    298         Go through all HostJobHistory objects in the board's pools,
    299         selecting the ones where the last diagnosis is `WORKING`.
    300 
    301         @return A list of HostJobHistory objects.
    302 
    303         """
    304         l = []
    305         for p in self._pools.values():
    306             l.extend(p.get_working_list())
    307         return l
    308 
    309 
    310     def get_working(self, pool=None):
    311         """Return the number of working DUTs in a pool.
    312 
    313         @param pool  The pool to be counted.  If `None`, return the
    314                      total across all pools.
    315 
    316         @return The total number of working DUTs in the selected
    317                 pool(s).
    318         """
    319         return self._count_pool(_PoolCounts.get_working, pool)
    320 
    321 
    322     def get_broken_list(self):
    323         """Return a list of all broken DUTs for the board.
    324 
    325         Go through all HostJobHistory objects in the board's pools,
    326         selecting the ones where the last diagnosis is `BROKEN`.
    327 
    328         @return A list of HostJobHistory objects.
    329 
    330         """
    331         l = []
    332         for p in self._pools.values():
    333             l.extend(p.get_broken_list())
    334         return l
    335 
    336 
    337     def get_broken(self, pool=None):
    338         """Return the number of broken DUTs in a pool.
    339 
    340         @param pool  The pool to be counted.  If `None`, return the
    341                      total across all pools.
    342 
    343         @return The total number of broken DUTs in the selected pool(s).
    344         """
    345         return self._count_pool(_PoolCounts.get_broken, pool)
    346 
    347 
    348     def get_idle_list(self, pool=None):
    349         """Return a list of all idle DUTs for the board.
    350 
    351         Go through all HostJobHistory objects in the board's pools,
    352         selecting the ones where the last diagnosis is `UNUSED` or `UNKNOWN`.
    353 
    354         @param pool: The pool to be counted. If `None`, return the total list
    355                      across all pools.
    356 
    357         @return A list of HostJobHistory objects.
    358 
    359         """
    360         if pool is None:
    361             l = []
    362             for p in self._pools.values():
    363                 l.extend(p.get_idle_list())
    364             return l
    365         else:
    366             return _PoolCounts.get_idle_list(self._pools[pool])
    367 
    368 
    369     def get_idle(self, pool=None):
    370         """Return the number of idle DUTs in a pool.
    371 
    372         @param pool: The pool to be counted. If `None`, return the total
    373                      across all pools.
    374 
    375         @return The total number of idle DUTs in the selected pool(s).
    376         """
    377         return self._count_pool(_PoolCounts.get_idle, pool)
    378 
    379 
    380     def get_spares_buffer(self):
    381         """Return the the nominal number of working spares.
    382 
    383         Calculates and returns how many working spares there would
    384         be in the spares pool if all broken DUTs were in the spares
    385         pool.  This number may be negative, indicating a shortfall
    386         in the critical pools.
    387 
    388         @return The total number DUTs in the spares pool, less the total
    389                 number of broken DUTs in all pools.
    390         """
    391         return self.get_total(SPARE_POOL) - self.get_broken()
    392 
    393 
    394     def get_total(self, pool=None):
    395         """Return the total number of DUTs in a pool.
    396 
    397         @param pool  The pool to be counted.  If `None`, return the
    398                      total across all pools.
    399 
    400         @return The total number of DUTs in the selected pool(s).
    401         """
    402         return self._count_pool(_PoolCounts.get_total, pool)
    403 
    404 
    405 class _LabInventory(dict):
    406     """Collection of `HostJobHistory` objects for the Lab's inventory.
    407 
    408     The collection is indexed by board.  Indexing returns the
    409     _BoardCounts object associated with the board.
    410 
    411     The collection is also iterable.  The iterator returns all the
    412     boards in the inventory, in unspecified order.
    413 
    414     """
    415 
    416     @staticmethod
    417     def _eligible_host(afehost):
    418         """Return whether this host is eligible for monitoring.
    419 
    420         Hosts with any label that's in `_EXCLUDED_LABELS` aren't
    421         eligible.
    422 
    423         @param afehost  The host to be tested for eligibility.
    424         """
    425         return not len(_EXCLUDED_LABELS.intersection(afehost.labels))
    426 
    427 
    428     @classmethod
    429     def create_inventory(cls, afe, start_time, end_time, boardlist=[]):
    430         """Return a Lab inventory with specified parameters.
    431 
    432         By default, gathers inventory from `HostJobHistory` objects
    433         for all DUTs in the `MANAGED_POOLS` list.  If `boardlist`
    434         is supplied, the inventory will be restricted to only the
    435         given boards.
    436 
    437         @param afe         AFE object for constructing the
    438                            `HostJobHistory` objects.
    439         @param start_time  Start time for the `HostJobHistory`
    440                            objects.
    441         @param end_time    End time for the `HostJobHistory`
    442                            objects.
    443         @param boardlist   List of boards to include.  If empty,
    444                            include all available boards.
    445         @return A `_LabInventory` object for the specified boards.
    446 
    447         """
    448         label_list = [constants.Labels.POOL_PREFIX + l
    449                           for l in MANAGED_POOLS]
    450         afehosts = afe.get_hosts(labels__name__in=label_list)
    451         if boardlist:
    452             # We're deliberately not checking host eligibility in this
    453             # code path.  This is a debug path, not used in production;
    454             # it may be useful to include ineligible hosts here.
    455             boardhosts = []
    456             for board in boardlist:
    457                 board_label = constants.Labels.BOARD_PREFIX + board
    458                 host_list = [h for h in afehosts
    459                                   if board_label in h.labels]
    460                 boardhosts.extend(host_list)
    461             afehosts = boardhosts
    462         else:
    463             afehosts = [h for h in afehosts if cls._eligible_host(h)]
    464         create = lambda host: (
    465                 status_history.HostJobHistory(afe, host,
    466                                               start_time, end_time))
    467         return cls([create(host) for host in afehosts])
    468 
    469 
    470     def __init__(self, histories):
    471         # N.B. The query that finds our hosts is restricted to those
    472         # with a valid pool: label, but doesn't check for a valid
    473         # board: label.  In some (insufficiently) rare cases, the
    474         # AFE hosts table has been known to (incorrectly) have DUTs
    475         # with a pool: but no board: label.  We explicitly exclude
    476         # those here.
    477         histories = [h for h in histories
    478                      if h.host_board is not None]
    479         boards = set([h.host_board for h in histories])
    480         initval = { board: _BoardCounts() for board in boards }
    481         super(_LabInventory, self).__init__(initval)
    482         self._dut_count = len(histories)
    483         self._managed_boards = {}
    484         for h in histories:
    485             self[h.host_board].record_host(h)
    486 
    487 
    488     def get_managed_boards(self, pool=_MANAGED_POOL_DEFAULT):
    489         """Return the set of "managed" boards.
    490 
    491         Operationally, saying a board is "managed" means that the
    492         board will be included in the "board" and "repair
    493         recommendations" reports.  That is, if there are failures in
    494         the board's inventory then lab techs will be asked to fix
    495         them without a separate ticket.
    496 
    497         For purposes of implementation, a board is "managed" if it
    498         has DUTs in both the spare and a non-spare (i.e. critical)
    499         pool.
    500 
    501         @param pool: The specified pool for managed boards.
    502         @return A set of all the boards that have both spare and
    503                 non-spare pools, unless the pool is specified,
    504                 then the set of boards in that pool.
    505         """
    506         if self._managed_boards.get(pool, None) is None:
    507             self._managed_boards[pool] = set()
    508             for board, counts in self.items():
    509                 # Get the counts for all pools, otherwise get it for the
    510                 # specified pool.
    511                 if pool == _MANAGED_POOL_DEFAULT:
    512                     spares = counts.get_total(SPARE_POOL)
    513                     total = counts.get_total()
    514                     if spares != 0 and spares != total:
    515                         self._managed_boards[pool].add(board)
    516                 else:
    517                     if counts.get_total(pool) != 0:
    518                         self._managed_boards[pool].add(board)
    519         return self._managed_boards[pool]
    520 
    521 
    522     def get_num_duts(self):
    523         """Return the total number of DUTs in the inventory."""
    524         return self._dut_count
    525 
    526 
    527     def get_num_boards(self):
    528         """Return the total number of boards in the inventory."""
    529         return len(self)
    530 
    531 
    532 def _sort_by_location(inventory_list):
    533     """Return a list of DUTs, organized by location.
    534 
    535     Take the given list of `HostJobHistory` objects, separate it
    536     into a list per lab, and sort each lab's list by location.  The
    537     order of sorting within a lab is
    538       * By row number within the lab,
    539       * then by rack number within the row,
    540       * then by host shelf number within the rack.
    541 
    542     Return a list of the sorted lists.
    543 
    544     Implementation note: host locations are sorted by converting
    545     each location into a base 100 number.  If row, rack or
    546     host numbers exceed the range [0..99], then sorting will
    547     break down.
    548 
    549     @return A list of sorted lists of DUTs.
    550 
    551     """
    552     BASE = 100
    553     lab_lists = {}
    554     for history in inventory_list:
    555         location = _HOSTNAME_PATTERN.match(history.host.hostname)
    556         if location:
    557             lab = location.group(1)
    558             key = 0
    559             for idx in location.group(2, 3, 4):
    560                 key = BASE * key + int(idx)
    561             lab_lists.setdefault(lab, []).append((key, history))
    562     return_list = []
    563     for dut_list in lab_lists.values():
    564         dut_list.sort(key=lambda t: t[0])
    565         return_list.append([t[1] for t in dut_list])
    566     return return_list
    567 
    568 
    569 def _score_repair_set(buffer_counts, repair_list):
    570     """Return a numeric score rating a set of DUTs to be repaired.
    571 
    572     `buffer_counts` is a dictionary mapping board names to the
    573     size of the board's spares buffer.
    574 
    575     `repair_list` is a list of DUTs to be repaired.
    576 
    577     This function calculates the new set of buffer counts that would
    578     result from the proposed repairs, and scores the new set using
    579     two numbers:
    580       * Worst case buffer count for any board (higher is better).
    581         This is the more siginficant number for comparison.
    582       * Number of boards at the worst case (lower is better).  This
    583         is the less significant number.
    584 
    585     Implementation note:  The score could fail to reflect the
    586     intended criteria if there are more than 1000 boards in the
    587     inventory.
    588 
    589     @param spare_counts A dictionary mapping boards to buffer counts.
    590     @param repair_list  A list of boards to be repaired.
    591     @return A numeric score.
    592 
    593     """
    594     # Go through `buffer_counts`, and create a list of new counts
    595     # that records the buffer count for each board after repair.
    596     # The new list of counts discards the board names, as they don't
    597     # contribute to the final score.
    598     _NBOARDS = 1000
    599     repair_inventory = _LabInventory(repair_list)
    600     new_counts = []
    601     for b, c in buffer_counts.items():
    602         if b in repair_inventory:
    603             newcount = repair_inventory[b].get_total()
    604         else:
    605             newcount = 0
    606         new_counts.append(c + newcount)
    607     # Go through the new list of counts.  Find the worst available
    608     # spares count, and count how many times that worst case occurs.
    609     worst_count = new_counts[0]
    610     num_worst = 1
    611     for c in new_counts[1:]:
    612         if c == worst_count:
    613             num_worst += 1
    614         elif c < worst_count:
    615             worst_count = c
    616             num_worst = 1
    617     # Return the calculated score
    618     return _NBOARDS * worst_count - num_worst
    619 
    620 
    621 def _generate_repair_recommendation(inventory, num_recommend):
    622     """Return a summary of selected DUTs needing repair.
    623 
    624     Returns a message recommending a list of broken DUTs to be
    625     repaired.  The list of DUTs is selected based on these
    626     criteria:
    627       * No more than `num_recommend` DUTs will be listed.
    628       * All DUTs must be in the same lab.
    629       * DUTs should be selected for some degree of physical
    630         proximity.
    631       * DUTs for boards with a low spares buffer are more important
    632         than DUTs with larger buffers.
    633 
    634     The algorithm used will guarantee that at least one DUT from a
    635     board with the smallest spares buffer will be recommended.  If
    636     the worst spares buffer number is shared by more than one board,
    637     the algorithm will tend to prefer repair sets that include more
    638     of those boards over sets that cover fewer boards.
    639 
    640     @param inventory      Inventory for generating recommendations.
    641     @param num_recommend  Number of DUTs to recommend for repair.
    642 
    643     """
    644     logging.debug('Creating DUT repair recommendations')
    645     board_buffer_counts = {}
    646     broken_list = []
    647     for board in inventory.get_managed_boards():
    648         logging.debug('Listing failed DUTs for %s', board)
    649         counts = inventory[board]
    650         if counts.get_broken() != 0:
    651             board_buffer_counts[board] = counts.get_spares_buffer()
    652             broken_list.extend(counts.get_broken_list())
    653     # N.B. The logic inside this loop may seem complicated, but
    654     # simplification is hard:
    655     #   * Calculating an initial recommendation outside of
    656     #     the loop likely would make things more complicated,
    657     #     not less.
    658     #   * It's necessary to calculate an initial lab slice once per
    659     #     lab _before_ the while loop, in case the number of broken
    660     #     DUTs in a lab is less than `num_recommend`.
    661     recommendation = None
    662     best_score = None
    663     for lab_duts in _sort_by_location(broken_list):
    664         start = 0
    665         end = num_recommend
    666         lab_slice = lab_duts[start : end]
    667         lab_score = _score_repair_set(board_buffer_counts,
    668                                       lab_slice)
    669         while end < len(lab_duts):
    670             start += 1
    671             end += 1
    672             new_slice = lab_duts[start : end]
    673             new_score = _score_repair_set(board_buffer_counts,
    674                                           new_slice)
    675             if new_score > lab_score:
    676                 lab_slice = new_slice
    677                 lab_score = new_score
    678         if recommendation is None or lab_score > best_score:
    679             recommendation = lab_slice
    680             best_score = lab_score
    681     # N.B. The trailing space here is manadatory:  Without it, Gmail
    682     # will parse the URL wrong.  Don't ask.  If you simply _must_
    683     # know more, go try it yourself...
    684     line_fmt = '%-30s %-16s %-6s\n    %s '
    685     message = ['Repair recommendations:\n',
    686                line_fmt % ( 'Hostname', 'Board', 'Servo?', 'Logs URL')]
    687     for h in recommendation:
    688         servo_name = servo_host.make_servo_hostname(h.host.hostname)
    689         servo_present = utils.host_is_in_lab_zone(servo_name)
    690         _, event = h.last_diagnosis()
    691         line = line_fmt % (
    692                 h.host.hostname, h.host_board,
    693                 'Yes' if servo_present else 'No', event.job_url)
    694         message.append(line)
    695     return '\n'.join(message)
    696 
    697 
    698 def _generate_board_inventory_message(inventory):
    699     """Generate the "board inventory" e-mail message.
    700 
    701     The board inventory is a list by board summarizing the number
    702     of working and broken DUTs, and the total shortfall or surplus
    703     of working devices relative to the minimum critical pool
    704     requirement.
    705 
    706     The report omits boards with no DUTs in the spare pool or with
    707     no DUTs in a critical pool.
    708 
    709     N.B. For sample output text formattted as users can expect to
    710     see it in e-mail and log files, refer to the unit tests.
    711 
    712     @param inventory  _LabInventory object with the inventory to
    713                       be reported on.
    714     @return String with the inventory message to be sent.
    715 
    716     """
    717     logging.debug('Creating board inventory')
    718     nworking = 0
    719     nbroken = 0
    720     nidle = 0
    721     nbroken_boards = 0
    722     ntotal_boards = 0
    723     summaries = []
    724     for board in inventory.get_managed_boards():
    725         counts = inventory[board]
    726         logging.debug('Counting %2d DUTS for board %s',
    727                       counts.get_total(), board)
    728         # Summary elements laid out in the same order as the text
    729         # headers:
    730         #     Board Avail   Bad  Idle  Good  Spare Total
    731         #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
    732         element = (board,
    733                    counts.get_spares_buffer(),
    734                    counts.get_broken(),
    735                    counts.get_idle(),
    736                    counts.get_working(),
    737                    counts.get_total(SPARE_POOL),
    738                    counts.get_total())
    739         if element[2]:
    740             summaries.append(element)
    741             nbroken_boards += 1
    742         ntotal_boards += 1
    743         nbroken += element[2]
    744         nidle += element[3]
    745         nworking += element[4]
    746     ntotal = nworking + nbroken + nidle
    747     summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    748     broken_percent = int(round(100.0 * nbroken / ntotal))
    749     idle_percent = int(round(100.0 * nidle / ntotal))
    750     working_percent = 100 - broken_percent - idle_percent
    751     message = ['Summary of DUTs in inventory:',
    752                '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
    753                '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
    754                    nbroken, broken_percent,
    755                    nidle, idle_percent,
    756                    nworking, working_percent,
    757                    ntotal),
    758                '',
    759                'Boards with failures: %d' % nbroken_boards,
    760                'Boards in inventory:  %d' % ntotal_boards,
    761                '', '',
    762                'Full board inventory:\n',
    763                '%-22s %5s %5s %5s %5s %5s %5s' % (
    764                    'Board', 'Avail', 'Bad', 'Idle', 'Good',
    765                    'Spare', 'Total')]
    766     message.extend(
    767             ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    768     return '\n'.join(message)
    769 
    770 
# Fixed preamble of the "pool inventory" e-mail.  It is the first
# element of the message assembled by
# _generate_pool_inventory_message().
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies:  All boards shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`.  Detailed
instructions can be found here:
    http://go/cros-manage-duts
'''
    779 
    780 
    781 def _generate_pool_inventory_message(inventory):
    782     """Generate the "pool inventory" e-mail message.
    783 
    784     The pool inventory is a list by pool and board summarizing the
    785     number of working and broken DUTs in the pool.  Only boards with
    786     at least one broken DUT are included in the list.
    787 
    788     N.B. For sample output text formattted as users can expect to
    789     see it in e-mail and log files, refer to the unit tests.
    790 
    791     @param inventory  _LabInventory object with the inventory to
    792                       be reported on.
    793     @return String with the inventory message to be sent.
    794 
    795     """
    796     logging.debug('Creating pool inventory')
    797     message = [_POOL_INVENTORY_HEADER]
    798     newline = ''
    799     for pool in CRITICAL_POOLS:
    800         message.append(
    801             '%sStatus for pool:%s, by board:' % (newline, pool))
    802         message.append(
    803             '%-20s   %5s %5s %5s %5s' % (
    804                 'Board', 'Bad', 'Idle', 'Good', 'Total'))
    805         data_list = []
    806         for board, counts in inventory.items():
    807             logging.debug('Counting %2d DUTs for %s, %s',
    808                           counts.get_total(pool), board, pool)
    809             broken = counts.get_broken(pool)
    810             idle = counts.get_idle(pool)
    811             # boards at full strength are not reported
    812             if broken == 0 and idle == 0:
    813                 continue
    814             working = counts.get_working(pool)
    815             total = counts.get_total(pool)
    816             data_list.append((board, broken, idle, working, total))
    817         if data_list:
    818             data_list = sorted(data_list, key=lambda d: -d[1])
    819             message.extend(
    820                 ['%-20s   %5d %5d %5d %5d' % t for t in data_list])
    821         else:
    822             message.append('(All boards at full strength)')
    823         newline = '\n'
    824     return '\n'.join(message)
    825 
    826 
# Fixed preamble of the "idle inventory" report.  It is the first
# element of the message assembled by
# _generate_idle_inventory_message().
_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies:  The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
    833 
    834 
    835 def _generate_idle_inventory_message(inventory):
    836     """Generate the "idle inventory" e-mail message.
    837 
    838     The idle inventory is a host list with corresponding pool and board,
    839     where the hosts are idle (`UNKWOWN` or `UNUSED`).
    840 
    841     N.B. For sample output text format as users can expect to
    842     see it in e-mail and log files, refer to the unit tests.
    843 
    844     @param inventory  _LabInventory object with the inventory to
    845                       be reported on.
    846     @return String with the inventory message to be sent.
    847 
    848     """
    849     logging.debug('Creating idle inventory')
    850     message = [_IDLE_INVENTORY_HEADER]
    851     message.append('Idle Host List:')
    852     message.append('%-30s %-20s %s' % ('Hostname', 'Board', 'Pool'))
    853     data_list = []
    854     for pool in MANAGED_POOLS:
    855         for board, counts in inventory.items():
    856             logging.debug('Counting %2d DUTs for %s, %s',
    857                           counts.get_total(pool), board, pool)
    858             data_list.extend([(dut.host.hostname, board, pool)
    859                                   for dut in counts.get_idle_list(pool)])
    860     if data_list:
    861         message.extend(['%-30s %-20s %s' % t for t in data_list])
    862     else:
    863         message.append('(No idle DUTs)')
    864     return '\n'.join(message)
    865 
    866 
    867 def _send_email(arguments, tag, subject, recipients, body):
    868     """Send an inventory e-mail message.
    869 
    870     The message is logged in the selected log directory using `tag`
    871     for the file name.
    872 
    873     If the --print option was requested, the message is neither
    874     logged nor sent, but merely printed on stdout.
    875 
    876     @param arguments   Parsed command-line options.
    877     @param tag         Tag identifying the inventory for logging
    878                        purposes.
    879     @param subject     E-mail Subject: header line.
    880     @param recipients  E-mail addresses for the To: header line.
    881     @param body        E-mail message body.
    882 
    883     """
    884     logging.debug('Generating email: "%s"', subject)
    885     all_recipients = ', '.join(recipients)
    886     report_body = '\n'.join([
    887             'To: %s' % all_recipients,
    888             'Subject: %s' % subject,
    889             '', body, ''])
    890     if arguments.debug:
    891         print report_body
    892     else:
    893         filename = os.path.join(arguments.logdir, tag)
    894         try:
    895             report_file = open(filename, 'w')
    896             report_file.write(report_body)
    897             report_file.close()
    898         except EnvironmentError as e:
    899             logging.error('Failed to write %s:  %s', filename, e)
    900         try:
    901             gmail_lib.send_email(all_recipients, subject, body)
    902         except Exception as e:
    903             logging.error('Failed to send e-mail to %s:  %s',
    904                           all_recipients, e)
    905 
    906 
    907 def _separate_email_addresses(address_list):
    908     """Parse a list of comma-separated lists of e-mail addresses.
    909 
    910     @param address_list  A list of strings containing comma
    911                          separate e-mail addresses.
    912     @return A list of the individual e-mail addresses.
    913 
    914     """
    915     newlist = []
    916     for arg in address_list:
    917         newlist.extend([email.strip() for email in arg.split(',')])
    918     return newlist
    919 
    920 
    921 def _verify_arguments(arguments):
    922     """Validate command-line arguments.
    923 
    924     Join comma separated e-mail addresses for `--board-notify` and
    925     `--pool-notify` in separate option arguments into a single list.
    926 
    927     For non-debug uses, require that notification be requested for
    928     at least one report.  For debug, if notification isn't specified,
    929     treat it as "run all the reports."
    930 
    931     The return value indicates success or failure; in the case of
    932     failure, we also write an error message to stderr.
    933 
    934     @param arguments  Command-line arguments as returned by
    935                       `ArgumentParser`
    936     @return True if the arguments are semantically good, or False
    937             if the arguments don't meet requirements.
    938 
    939     """
    940     arguments.board_notify = _separate_email_addresses(
    941             arguments.board_notify)
    942     arguments.pool_notify = _separate_email_addresses(
    943             arguments.pool_notify)
    944     if not arguments.board_notify and not arguments.pool_notify:
    945         if not arguments.debug:
    946             sys.stderr.write('Must specify at least one of '
    947                              '--board-notify or --pool-notify\n')
    948             return False
    949         else:
    950             # We want to run all the reports.  An empty notify list
    951             # will cause a report to be skipped, so make sure the
    952             # lists are non-empty.
    953             arguments.board_notify = ['']
    954             arguments.pool_notify = ['']
    955     return True
    956 
    957 
    958 def _get_logdir(script):
    959     """Get the default directory for the `--logdir` option.
    960 
    961     The default log directory is based on the parent directory
    962     containing this script.
    963 
    964     @param script  Path to this script file.
    965     @return A path to a directory.
    966 
    967     """
    968     basedir = os.path.dirname(os.path.abspath(script))
    969     basedir = os.path.dirname(basedir)
    970     return os.path.join(basedir, _LOGDIR)
    971 
    972 
    973 def _parse_command(argv):
    974     """Parse the command line arguments.
    975 
    976     Create an argument parser for this command's syntax, parse the
    977     command line, and return the result of the ArgumentParser
    978     parse_args() method.
    979 
    980     @param argv Standard command line argument vector; argv[0] is
    981                 assumed to be the command name.
    982     @return Result returned by ArgumentParser.parse_args().
    983 
    984     """
    985     parser = argparse.ArgumentParser(
    986             prog=argv[0],
    987             description='Gather and report lab inventory statistics')
    988     parser.add_argument('-d', '--duration', type=int,
    989                         default=_DEFAULT_DURATION, metavar='HOURS',
    990                         help='number of hours back to search for status'
    991                              ' (default: %d)' % _DEFAULT_DURATION)
    992     parser.add_argument('--board-notify', action='append',
    993                         default=[], metavar='ADDRESS',
    994                         help='Generate board inventory message, '
    995                         'and send it to the given e-mail address(es)')
    996     parser.add_argument('--pool-notify', action='append',
    997                         default=[], metavar='ADDRESS',
    998                         help='Generate pool inventory message, '
    999                              'and send it to the given address(es)')
   1000     parser.add_argument('-r', '--recommend', type=int, default=None,
   1001                         help=('Specify how many DUTs should be '
   1002                               'recommended for repair (default: no '
   1003                               'recommendation)'))
   1004     parser.add_argument('--debug', action='store_true',
   1005                         help='Print e-mail messages on stdout '
   1006                              'without sending them.')
   1007     parser.add_argument('--logdir', default=_get_logdir(argv[0]),
   1008                         help='Directory where logs will be written.')
   1009     parser.add_argument('boardnames', nargs='*',
   1010                         metavar='BOARD',
   1011                         help='names of boards to report on '
   1012                              '(default: all boards)')
   1013     arguments = parser.parse_args(argv[1:])
   1014     if not _verify_arguments(arguments):
   1015         return None
   1016     return arguments
   1017 
   1018 
   1019 def _configure_logging(arguments):
   1020     """Configure the `logging` module for our needs.
   1021 
   1022     How we log depends on whether the `--print` option was
   1023     provided on the command line.  Without the option, we log all
   1024     messages at DEBUG level or above, and write them to a file in
   1025     the directory specified by the `--logdir` option.  With the
   1026     option, we write log messages to stdout; messages below INFO
   1027     level are discarded.
   1028 
   1029     The log file is configured to rotate once a week on Friday
   1030     evening, preserving ~3 months worth of history.
   1031 
   1032     @param arguments  Command-line arguments as returned by
   1033                       `ArgumentParser`
   1034 
   1035     """
   1036     root_logger = logging.getLogger()
   1037     if arguments.debug:
   1038         root_logger.setLevel(logging.INFO)
   1039         handler = logging.StreamHandler(sys.stdout)
   1040         handler.setFormatter(logging.Formatter())
   1041     else:
   1042         if not os.path.exists(arguments.logdir):
   1043             os.mkdir(arguments.logdir)
   1044         root_logger.setLevel(logging.DEBUG)
   1045         logfile = os.path.join(arguments.logdir, _LOGFILE)
   1046         handler = logging.handlers.TimedRotatingFileHandler(
   1047                 logfile, when='W4', backupCount=13)
   1048         formatter = logging.Formatter(_LOG_FORMAT,
   1049                                       time_utils.TIME_FMT)
   1050         handler.setFormatter(formatter)
   1051     # TODO(jrbarnette) This is gross.  Importing client.bin.utils
   1052     # implicitly imported logging_config, which calls
   1053     # logging.basicConfig() *at module level*.  That gives us an
   1054     # extra logging handler that we don't want.  So, clear out all
   1055     # the handlers here.
   1056     for h in root_logger.handlers:
   1057         root_logger.removeHandler(h)
   1058     root_logger.addHandler(handler)
   1059 
   1060 
   1061 def _populate_board_counts(inventory):
   1062     """Gather board counts while providing interactive feedback.
   1063 
   1064     Gathering the status of all individual DUTs in the lab can take
   1065     considerable time (~30 minutes at the time of this writing).
   1066 
   1067     Normally, we pay that cost by querying as we go.  However, with
   1068     the `--print` option, a human being may be watching the
   1069     progress.  So, we force the first (expensive) queries to happen
   1070     up front, and provide a small ASCII progress bar to give an
   1071     indicator of how many boards have been processed.
   1072 
   1073     @param inventory  _LabInventory object with the inventory to
   1074                       be gathered.
   1075 
   1076     """
   1077     n = 0
   1078     total_broken = 0
   1079     for counts in inventory.values():
   1080         n += 1
   1081         if n % 10 == 5:
   1082             c = '+'
   1083         elif n % 10 == 0:
   1084             c = '%d' % ((n / 10) % 10)
   1085         else:
   1086             c = '.'
   1087         sys.stdout.write(c)
   1088         sys.stdout.flush()
   1089         # This next call is where all the time goes - it forces all
   1090         # of a board's HostJobHistory objects to query the database
   1091         # and cache their results.
   1092         total_broken += counts.get_broken()
   1093     sys.stdout.write('\n')
   1094     sys.stdout.write('Found %d broken DUTs\n' % total_broken)
   1095 
   1096 
   1097 def main(argv):
   1098     """Standard main routine.
   1099     @param argv  Command line arguments including `sys.argv[0]`.
   1100     """
   1101     arguments = _parse_command(argv)
   1102     if not arguments:
   1103         sys.exit(1)
   1104     _configure_logging(arguments)
   1105     try:
   1106         end_time = int(time.time())
   1107         start_time = end_time - arguments.duration * 60 * 60
   1108         timestamp = time.strftime('%Y-%m-%d.%H',
   1109                                   time.localtime(end_time))
   1110         logging.debug('Starting lab inventory for %s', timestamp)
   1111         if arguments.board_notify:
   1112             if arguments.recommend:
   1113                 logging.debug('Will include repair recommendations')
   1114             logging.debug('Will include board inventory')
   1115         if arguments.pool_notify:
   1116             logging.debug('Will include pool inventory')
   1117 
   1118         afe = frontend_wrappers.RetryingAFE(server=None)
   1119         inventory = _LabInventory.create_inventory(
   1120                 afe, start_time, end_time, arguments.boardnames)
   1121         logging.info('Found %d hosts across %d boards',
   1122                          inventory.get_num_duts(),
   1123                          inventory.get_num_boards())
   1124 
   1125         if arguments.debug:
   1126             _populate_board_counts(inventory)
   1127 
   1128         if arguments.board_notify:
   1129             if arguments.recommend:
   1130                 recommend_message = _generate_repair_recommendation(
   1131                         inventory, arguments.recommend) + '\n\n\n'
   1132             else:
   1133                 recommend_message = ''
   1134             board_message = _generate_board_inventory_message(inventory)
   1135             _send_email(arguments,
   1136                         'boards-%s.txt' % timestamp,
   1137                         'DUT board inventory %s' % timestamp,
   1138                         arguments.board_notify,
   1139                         recommend_message + board_message)
   1140 
   1141         if arguments.pool_notify:
   1142             pool_message = _generate_pool_inventory_message(inventory)
   1143             idle_message = _generate_idle_inventory_message(inventory)
   1144             _send_email(arguments,
   1145                         'pools-%s.txt' % timestamp,
   1146                         'DUT pool inventory %s' % timestamp,
   1147                         arguments.pool_notify,
   1148                         pool_message + '\n\n\n' + idle_message)
   1149     except KeyboardInterrupt:
   1150         pass
   1151     except EnvironmentError as e:
   1152         logging.exception('Unexpected OS error: %s', e)
   1153     except Exception as e:
   1154         logging.exception('Unexpected exception: %s', e)
   1155 
   1156 
   1157 def get_inventory(afe):
   1158     end_time = int(time.time())
   1159     start_time = end_time - 24 * 60 * 60
   1160     return _LabInventory.create_inventory(afe, start_time, end_time)
   1161 
   1162 
   1163 def get_managed_boards(afe):
   1164     return get_inventory(afe).get_managed_boards()
   1165 
   1166 
# When run as a script, hand the full argument vector (including the
# program name in argv[0]) to main().
if __name__ == '__main__':
    main(sys.argv)
   1169