Home | History | Annotate | Download | only in site_utils
      1 #!/usr/bin/env python
      2 # Copyright 2015 The Chromium OS Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """Create e-mail reports of the Lab's DUT inventory.
      7 
      8 Gathers a list of all DUTs of interest in the Lab, segregated by
      9 board and pool, and determines whether each DUT is working or
     10 broken.  Then, send one or more e-mail reports summarizing the
     11 status to e-mail addresses provided on the command line.
     12 
     13 usage:  lab_inventory.py [ options ] [ board ... ]
     14 
     15 Options:
     16 --duration / -d <hours>
     17     How far back in time to search job history to determine DUT
     18     status.
     19 
     20 --board-notify <address>[,<address>]
     21     Send the "board status" e-mail to all the specified e-mail
     22     addresses.
     23 
     24 --pool-notify <address>[,<address>]
     25     Send the "pool status" e-mail to all the specified e-mail
     26     addresses.
     27 
     28 --recommend <number>
    When generating the "board status" e-mail, include a list of
     30     <number> specific DUTs to be recommended for repair.
     31 
     32 --logdir <directory>
     33     Log progress and actions in a file under this directory.  Text
     34     of any e-mail sent will also be logged in a timestamped file in
     35     this directory.
     36 
     37 --debug
     38     Suppress all logging and sending e-mail.  Instead, write the
     39     output that would be generated onto stdout.
     40 
     41 <board> arguments:
     42     With no arguments, gathers the status for all boards in the lab.
     43     With one or more named boards on the command line, restricts
     44     reporting to just those boards.
     45 
     46 """
     47 
     48 
     49 import argparse
     50 import logging
     51 import logging.handlers
     52 import os
     53 import re
     54 import sys
     55 import time
     56 
     57 import common
     58 from autotest_lib.client.bin import utils
     59 from autotest_lib.client.common_lib import time_utils
     60 from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
     61 from autotest_lib.server.hosts import servo_host
     62 from autotest_lib.site_utils import gmail_lib
     63 from autotest_lib.site_utils import status_history
     64 from autotest_lib.site_utils.suite_scheduler import constants
     65 
     66 
# The pools in the Lab that are actually of interest.
#
# These are general purpose pools of DUTs that are considered
# identical for purposes of testing.  That is, a device in one of
# these pools can be shifted to another pool at will for purposes
# of supplying test demand.
#
# Devices in these pools are not allowed to have special-purpose
# attachments, or to be part of any kind of custom fixture.
# Devices in these pools are also required to reside in areas
# managed by the Platforms team (i.e. at the time of this writing,
# only in "Atlantis" or "Destiny").
#
# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
#     to guarantee timely completion of tests from builders.
# _SPARE_POOL - A low priority pool that is allowed to provide
#     spares to replace broken devices in the critical pools.
# _MANAGED_POOLS - The set of all the general purpose pools
#     monitored by this script, i.e. every critical pool plus the
#     spare pool.

_CRITICAL_POOLS = ['bvt', 'cq', 'continuous']
_SPARE_POOL = 'suites'
_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]
     90 
# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time to search in order to determine
#     DUT status.  The usage text at the top of this file documents
#     the option's argument as a number of hours.

_DEFAULT_DURATION = 24
     97 
# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option.  The full path is relative to the
#     root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages:  timestamp, level name
#     (left-justified in a 10 character field), and message text,
#     separated by ' | '.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'
    112 
# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
#
# Capture groups, in order:
#   1 - the lab name ("chromeos" followed by digits),
#   2 - the row number,
#   3 - the rack number,
#   4 - the host number.
# A name of the form "chromeos2-row3-rack4-host5" matches.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
    123 
    124 
    125 class _PoolCounts(object):
    126     """Maintains a set of `HostJobHistory` objects for a pool.
    127 
    128     The collected history objects are nominally all part of a single
    129     scheduling pool of DUTs.  The collection maintains a list of
    130     working DUTs, a list of broken DUTs, and a list of all DUTs.
    131 
    132     Performance note:  Certain methods in this class are potentially
    133     expensive:
    134       * `get_working()`
    135       * `get_working_list()`
    136       * `get_broken()`
    137       * `get_broken_list()`
    138     The first time any one of these methods is called, it causes
    139     multiple RPC calls with a relatively expensive set of database
    140     queries.  However, the results of the queries are cached in the
    141     individual `HostJobHistory` objects, so only the first call
    142     actually pays the full cost.
    143 
    144     Additionally, `get_working_list()` and `get_broken_list()` both
    145     cache their return values to avoid recalculating lists at every
    146     call; this caching is separate from the caching of RPC results
    147     described above.
    148 
    149     This class is deliberately constructed to delay the RPC cost
    150     until the accessor methods are called (rather than to query in
    151     `record_host()`) so that it's possible to construct a complete
    152     `_LabInventory` without making the expensive queries at creation
    153     time.  `_populate_board_counts()`, below, assumes this behavior.
    154 
    155     """
    156 
    157     def __init__(self):
    158         self._histories = []
    159         self._working_list = None
    160         self._broken_list = None
    161 
    162 
    163     def record_host(self, host_history):
    164         """Add one `HostJobHistory` object to the collection.
    165 
    166         @param host_history The `HostJobHistory` object to be
    167                             remembered.
    168 
    169         """
    170         self._working_list = None
    171         self._broken_list = None
    172         self._histories.append(host_history)
    173 
    174 
    175     def get_working_list(self):
    176         """Return a list of all working DUTs in the pool.
    177 
    178         Filter `self._histories` for histories where the last
    179         diagnosis is `WORKING`.
    180 
    181         Cache the result so that we only cacluate it once.
    182 
    183         @return A list of HostJobHistory objects.
    184 
    185         """
    186         if self._working_list is None:
    187             self._working_list = [h for h in self._histories
    188                     if h.last_diagnosis()[0] == status_history.WORKING]
    189         return self._working_list
    190 
    191 
    192     def get_working(self):
    193         """Return the number of working DUTs in the pool."""
    194         return len(self.get_working_list())
    195 
    196 
    197     def get_broken_list(self):
    198         """Return a list of all broken DUTs in the pool.
    199 
    200         Filter `self._histories` for histories where the last
    201         diagnosis is not `WORKING`.
    202 
    203         Cache the result so that we only cacluate it once.
    204 
    205         @return A list of HostJobHistory objects.
    206 
    207         """
    208         if self._broken_list is None:
    209             self._broken_list = [h for h in self._histories
    210                     if h.last_diagnosis()[0] != status_history.WORKING]
    211         return self._broken_list
    212 
    213 
    214     def get_broken(self):
    215         """Return the number of broken DUTs in the pool."""
    216         return len(self.get_broken_list())
    217 
    218 
    219     def get_total(self):
    220         """Return the total number of DUTs in the pool."""
    221         return len(self._histories)
    222 
    223 
    224 class _BoardCounts(object):
    225     """Maintains a set of `HostJobHistory` objects for a board.
    226 
    227     The collected history objects are nominally all of the same
    228     board.  The collection maintains a count of working DUTs, a
    229     count of broken DUTs, and a total count.  The counts can be
    230     obtained either for a single pool, or as a total across all
    231     pools.
    232 
    233     DUTs in the collection must be assigned to one of the pools
    234     in `_MANAGED_POOLS`.
    235 
    236     The `get_working()` and `get_broken()` methods rely on the
    237     methods of the same name in _PoolCounts, so the performance
    238     note in _PoolCounts applies here as well.
    239 
    240     """
    241 
    242     def __init__(self):
    243         self._pools = {
    244             pool: _PoolCounts() for pool in _MANAGED_POOLS
    245         }
    246 
    247     def record_host(self, host_history):
    248         """Add one `HostJobHistory` object to the collection.
    249 
    250         @param host_history The `HostJobHistory` object to be
    251                             remembered.
    252 
    253         """
    254         pool = host_history.host_pool
    255         self._pools[pool].record_host(host_history)
    256 
    257 
    258     def _count_pool(self, get_pool_count, pool=None):
    259         """Internal helper to count hosts in a given pool.
    260 
    261         The `get_pool_count` parameter is a function to calculate
    262         the exact count of interest for the pool.
    263 
    264         @param get_pool_count  Function to return a count from a
    265                                _PoolCount object.
    266         @param pool            The pool to be counted.  If `None`,
    267                                return the total across all pools.
    268 
    269         """
    270         if pool is None:
    271             return sum([get_pool_count(counts)
    272                             for counts in self._pools.values()])
    273         else:
    274             return get_pool_count(self._pools[pool])
    275 
    276 
    277     def get_working_list(self):
    278         """Return a list of all working DUTs for the board.
    279 
    280         Go through all HostJobHistory objects in the board's pools,
    281         selecting the ones where the last diagnosis is `WORKING`.
    282 
    283         @return A list of HostJobHistory objects.
    284 
    285         """
    286         l = []
    287         for p in self._pools.values():
    288             l.extend(p.get_working_list())
    289         return l
    290 
    291 
    292     def get_working(self, pool=None):
    293         """Return the number of working DUTs in a pool.
    294 
    295         @param pool  The pool to be counted.  If `None`, return the
    296                      total across all pools.
    297 
    298         @return The total number of working DUTs in the selected
    299                 pool(s).
    300         """
    301         return self._count_pool(_PoolCounts.get_working, pool)
    302 
    303 
    304     def get_broken_list(self):
    305         """Return a list of all broken DUTs for the board.
    306 
    307         Go through all HostJobHistory objects in the board's pools,
    308         selecting the ones where the last diagnosis is not
    309         `WORKING`.
    310 
    311         @return A list of HostJobHistory objects.
    312 
    313         """
    314         l = []
    315         for p in self._pools.values():
    316             l.extend(p.get_broken_list())
    317         return l
    318 
    319 
    320     def get_broken(self, pool=None):
    321         """Return the number of broken DUTs in a pool.
    322 
    323         @param pool  The pool to be counted.  If `None`, return the
    324                      total across all pools.
    325 
    326         @return The total number of broken DUTs in the selected pool(s).
    327         """
    328         return self._count_pool(_PoolCounts.get_broken, pool)
    329 
    330 
    331     def get_spares_buffer(self):
    332         """Return the the nominal number of working spares.
    333 
    334         Calculates and returns how many working spares there would
    335         be in the spares pool if all broken DUTs were in the spares
    336         pool.  This number may be negative, indicating a shortfall
    337         in the critical pools.
    338 
    339         @return The total number DUTs in the spares pool, less the total
    340                 number of broken DUTs in all pools.
    341         """
    342         return self.get_total(_SPARE_POOL) - self.get_broken()
    343 
    344 
    345     def get_total(self, pool=None):
    346         """Return the total number of DUTs in a pool.
    347 
    348         @param pool  The pool to be counted.  If `None`, return the
    349                      total across all pools.
    350 
    351         @return The total number of DUTs in the selected pool(s).
    352         """
    353         return self._count_pool(_PoolCounts.get_total, pool)
    354 
    355 
    356 class _LabInventory(dict):
    357     """Collection of `HostJobHistory` objects for the Lab's inventory.
    358 
    359     The collection is indexed by board.  Indexing returns the
    360     _BoardCounts object associated with the board.
    361 
    362     The collection is also iterable.  The iterator returns all the
    363     boards in the inventory, in unspecified order.
    364 
    365     """
    366 
    367     @classmethod
    368     def create_inventory(cls, afe, start_time, end_time, boardlist=[]):
    369         """Return a Lab inventory with specified parameters.
    370 
    371         By default, gathers inventory from `HostJobHistory` objects
    372         for all DUTs in the `_MANAGED_POOLS` list.  If `boardlist`
    373         is supplied, the inventory will be restricted to only the
    374         given boards.
    375 
    376         @param afe         AFE object for constructing the
    377                            `HostJobHistory` objects.
    378         @param start_time  Start time for the `HostJobHistory`
    379                            objects.
    380         @param end_time    End time for the `HostJobHistory`
    381                            objects.
    382         @param boardlist   List of boards to include.  If empty,
    383                            include all available boards.
    384         @return A `_LabInventory` object for the specified boards.
    385 
    386         """
    387         label_list = [constants.Labels.POOL_PREFIX + l
    388                           for l in _MANAGED_POOLS]
    389         afehosts = afe.get_hosts(labels__name__in=label_list)
    390         if boardlist:
    391             boardhosts = []
    392             for board in boardlist:
    393                 board_label = constants.Labels.BOARD_PREFIX + board
    394                 host_list = [h for h in afehosts
    395                                   if board_label in h.labels]
    396                 boardhosts.extend(host_list)
    397             afehosts = boardhosts
    398         create = lambda host: (
    399                 status_history.HostJobHistory(afe, host,
    400                                               start_time, end_time))
    401         return cls([create(host) for host in afehosts])
    402 
    403 
    404     def __init__(self, histories):
    405         # N.B. The query that finds our hosts is restricted to those
    406         # with a valid pool: label, but doesn't check for a valid
    407         # board: label.  In some (insufficiently) rare cases, the
    408         # AFE hosts table has been known to (incorrectly) have DUTs
    409         # with a pool: but no board: label.  We explicitly exclude
    410         # those here.
    411         histories = [h for h in histories
    412                      if h.host_board is not None]
    413         boards = set([h.host_board for h in histories])
    414         initval = { board: _BoardCounts() for board in boards }
    415         super(_LabInventory, self).__init__(initval)
    416         self._dut_count = len(histories)
    417         self._managed_boards = None
    418         for h in histories:
    419             self[h.host_board].record_host(h)
    420 
    421 
    422     def get_managed_boards(self):
    423         """Return the set of "managed" boards.
    424 
    425         Operationally, saying a board is "managed" means that the
    426         board will be included in the "board" and "repair
    427         recommendations" reports.  That is, if there are failures in
    428         the board's inventory then lab techs will be asked to fix
    429         them without a separate ticket.
    430 
    431         For purposes of implementation, a board is "managed" if it
    432         has DUTs in both the spare and a non-spare (i.e. critical)
    433         pool.
    434 
    435         @return A set of all the boards that have both spare and
    436                 non-spare pools.
    437         """
    438         if self._managed_boards is None:
    439             self._managed_boards = set()
    440             for board, counts in self.items():
    441                 spares = counts.get_total(_SPARE_POOL)
    442                 total = counts.get_total()
    443                 if spares != 0 and spares != total:
    444                     self._managed_boards.add(board)
    445         return self._managed_boards
    446 
    447 
    448     def get_num_duts(self):
    449         """Return the total number of DUTs in the inventory."""
    450         return self._dut_count
    451 
    452 
    453     def get_num_boards(self):
    454         """Return the total number of boards in the inventory."""
    455         return len(self)
    456 
    457 
    458 def _sort_by_location(inventory_list):
    459     """Return a list of DUTs, organized by location.
    460 
    461     Take the given list of `HostJobHistory` objects, separate it
    462     into a list per lab, and sort each lab's list by location.  The
    463     order of sorting within a lab is
    464       * By row number within the lab,
    465       * then by rack number within the row,
    466       * then by host shelf number within the rack.
    467 
    468     Return a list of the sorted lists.
    469 
    470     Implementation note: host locations are sorted by converting
    471     each location into a base 100 number.  If row, rack or
    472     host numbers exceed the range [0..99], then sorting will
    473     break down.
    474 
    475     @return A list of sorted lists of DUTs.
    476 
    477     """
    478     BASE = 100
    479     lab_lists = {}
    480     for history in inventory_list:
    481         location = _HOSTNAME_PATTERN.match(history.host.hostname)
    482         if location:
    483             lab = location.group(1)
    484             key = 0
    485             for idx in location.group(2, 3, 4):
    486                 key = BASE * key + int(idx)
    487             lab_lists.setdefault(lab, []).append((key, history))
    488     return_list = []
    489     for dut_list in lab_lists.values():
    490         dut_list.sort(key=lambda t: t[0])
    491         return_list.append([t[1] for t in dut_list])
    492     return return_list
    493 
    494 
    495 def _score_repair_set(buffer_counts, repair_list):
    496     """Return a numeric score rating a set of DUTs to be repaired.
    497 
    498     `buffer_counts` is a dictionary mapping board names to the
    499     size of the board's spares buffer.
    500 
    501     `repair_list` is a list of DUTs to be repaired.
    502 
    503     This function calculates the new set of buffer counts that would
    504     result from the proposed repairs, and scores the new set using
    505     two numbers:
    506       * Worst case buffer count for any board (higher is better).
    507         This is the more siginficant number for comparison.
    508       * Number of boards at the worst case (lower is better).  This
    509         is the less significant number.
    510 
    511     Implementation note:  The score could fail to reflect the
    512     intended criteria if there are more than 1000 boards in the
    513     inventory.
    514 
    515     @param spare_counts A dictionary mapping boards to buffer counts.
    516     @param repair_list  A list of boards to be repaired.
    517     @return A numeric score.
    518 
    519     """
    520     # Go through `buffer_counts`, and create a list of new counts
    521     # that records the buffer count for each board after repair.
    522     # The new list of counts discards the board names, as they don't
    523     # contribute to the final score.
    524     _NBOARDS = 1000
    525     repair_inventory = _LabInventory(repair_list)
    526     new_counts = []
    527     for b, c in buffer_counts.items():
    528         if b in repair_inventory:
    529             newcount = repair_inventory[b].get_total()
    530         else:
    531             newcount = 0
    532         new_counts.append(c + newcount)
    533     # Go through the new list of counts.  Find the worst available
    534     # spares count, and count how many times that worst case occurs.
    535     worst_count = new_counts[0]
    536     num_worst = 1
    537     for c in new_counts[1:]:
    538         if c == worst_count:
    539             num_worst += 1
    540         elif c < worst_count:
    541             worst_count = c
    542             num_worst = 1
    543     # Return the calculated score
    544     return _NBOARDS * worst_count - num_worst
    545 
    546 
    547 def _generate_repair_recommendation(inventory, num_recommend):
    548     """Return a summary of selected DUTs needing repair.
    549 
    550     Returns a message recommending a list of broken DUTs to be
    551     repaired.  The list of DUTs is selected based on these
    552     criteria:
    553       * No more than `num_recommend` DUTs will be listed.
    554       * All DUTs must be in the same lab.
    555       * DUTs should be selected for some degree of physical
    556         proximity.
    557       * DUTs for boards with a low spares buffer are more important
    558         than DUTs with larger buffers.
    559 
    560     The algorithm used will guarantee that at least one DUT from a
    561     board with the smallest spares buffer will be recommended.  If
    562     the worst spares buffer number is shared by more than one board,
    563     the algorithm will tend to prefer repair sets that include more
    564     of those boards over sets that cover fewer boards.
    565 
    566     @param inventory      Inventory for generating recommendations.
    567     @param num_recommend  Number of DUTs to recommend for repair.
    568 
    569     """
    570     logging.debug('Creating DUT repair recommendations')
    571     board_buffer_counts = {}
    572     broken_list = []
    573     for board in inventory.get_managed_boards():
    574         logging.debug('Listing failed DUTs for %s', board)
    575         counts = inventory[board]
    576         if counts.get_broken() != 0:
    577             board_buffer_counts[board] = counts.get_spares_buffer()
    578             broken_list.extend(counts.get_broken_list())
    579     # N.B. The logic inside this loop may seem complicated, but
    580     # simplification is hard:
    581     #   * Calculating an initial recommendation outside of
    582     #     the loop likely would make things more complicated,
    583     #     not less.
    584     #   * It's necessary to calculate an initial lab slice once per
    585     #     lab _before_ the while loop, in case the number of broken
    586     #     DUTs in a lab is less than `num_recommend`.
    587     recommendation = None
    588     best_score = None
    589     for lab_duts in _sort_by_location(broken_list):
    590         start = 0
    591         end = num_recommend
    592         lab_slice = lab_duts[start : end]
    593         lab_score = _score_repair_set(board_buffer_counts,
    594                                       lab_slice)
    595         while end < len(lab_duts):
    596             start += 1
    597             end += 1
    598             new_slice = lab_duts[start : end]
    599             new_score = _score_repair_set(board_buffer_counts,
    600                                           new_slice)
    601             if new_score > lab_score:
    602                 lab_slice = new_slice
    603                 lab_score = new_score
    604         if recommendation is None or lab_score > best_score:
    605             recommendation = lab_slice
    606             best_score = lab_score
    607     message = ['Repair recommendations:\n',
    608                '%-30s %-16s %s' % (
    609                        'Hostname', 'Board', 'Servo instructions')]
    610     for h in recommendation:
    611         servo_name = servo_host.make_servo_hostname(h.host.hostname)
    612         if utils.host_is_in_lab_zone(servo_name):
    613             servo_message = 'Repair servo first'
    614         else:
    615             servo_message = 'No servo present'
    616         line = '%-30s %-16s %s' % (
    617                 h.host.hostname, h.host_board, servo_message)
    618         message.append(line)
    619     return '\n'.join(message)
    620 
    621 
    622 def _generate_board_inventory_message(inventory):
    623     """Generate the "board inventory" e-mail message.
    624 
    625     The board inventory is a list by board summarizing the number
    626     of working and broken DUTs, and the total shortfall or surplus
    627     of working devices relative to the minimum critical pool
    628     requirement.
    629 
    630     The report omits boards with no DUTs in the spare pool or with
    631     no DUTs in a critical pool.
    632 
    633     N.B. For sample output text formattted as users can expect to
    634     see it in e-mail and log files, refer to the unit tests.
    635 
    636     @param inventory  _LabInventory object with the inventory to
    637                       be reported on.
    638     @return String with the inventory message to be sent.
    639 
    640     """
    641     logging.debug('Creating board inventory')
    642     nworking = 0
    643     nbroken = 0
    644     nbroken_boards = 0
    645     summaries = []
    646     for board in inventory.get_managed_boards():
    647         logging.debug('Counting board inventory for %s', board)
    648         counts = inventory[board]
    649         # Summary elements laid out in the same order as the text
    650         # headers:
    651         #     Board Avail   Bad  Good Spare Total
    652         #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]
    653         element = (board,
    654                    counts.get_spares_buffer(),
    655                    counts.get_broken(),
    656                    counts.get_working(),
    657                    counts.get_total(_SPARE_POOL),
    658                    counts.get_total())
    659         summaries.append(element)
    660         nbroken += element[2]
    661         nworking += element[3]
    662         if element[2]:
    663             nbroken_boards += 1
    664     ntotal = nworking + nbroken
    665     summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    666     broken_percent = int(round(100.0 * nbroken / ntotal))
    667     working_percent = 100 - broken_percent
    668     message = ['Summary of DUTs in inventory:',
    669                '%10s %10s %6s' % ('Bad', 'Good', 'Total'),
    670                '%5d %3d%% %5d %3d%% %6d' % (
    671                    nbroken, broken_percent,
    672                    nworking, working_percent,
    673                    ntotal),
    674                '',
    675                'Boards with failures: %d' % nbroken_boards,
    676                'Boards in inventory:  %d' % len(summaries),
    677                '', '',
    678                'Full board inventory:\n',
    679                '%-22s %5s %5s %5s %5s %5s' % (
    680                    'Board', 'Avail', 'Bad', 'Good',
    681                    'Spare', 'Total')]
    682     message.extend(
    683             ['%-22s %5d %5d %5d %5d %5d' % e for e in summaries])
    684     return '\n'.join(message)
    685 
    686 
    687 _POOL_INVENTORY_HEADER = '''\
    688 Notice to Infrastructure deputies:  All boards shown below are at
    689 less than full strength, please take action to resolve the issues.
    690 Once you're satisified that failures won't recur, failed DUTs can
    691 be replaced with spares by running `balance_pool`.  Detailed
    692 instructions can be found here:
    693     http://go/cros-manage-duts
    694 '''
    695 
    696 
    697 def _generate_pool_inventory_message(inventory):
    698     """Generate the "pool inventory" e-mail message.
    699 
    700     The pool inventory is a list by pool and board summarizing the
    701     number of working and broken DUTs in the pool.  Only boards with
    702     at least one broken DUT are included in the list.
    703 
    704     N.B. For sample output text formattted as users can expect to
    705     see it in e-mail and log files, refer to the unit tests.
    706 
    707     @param inventory  _LabInventory object with the inventory to
    708                       be reported on.
    709     @return String with the inventory message to be sent.
    710 
    711     """
    712     logging.debug('Creating pool inventory')
    713     message = [_POOL_INVENTORY_HEADER]
    714     newline = ''
    715     for pool in _CRITICAL_POOLS:
    716         message.append(
    717             '%sStatus for pool:%s, by board:' % (newline, pool))
    718         message.append(
    719             '%-20s   %5s %5s %5s' % (
    720                 'Board', 'Bad', 'Good', 'Total'))
    721         data_list = []
    722         for board, counts in inventory.items():
    723             logging.debug('Counting inventory for %s, %s',
    724                           board, pool)
    725             broken = counts.get_broken(pool)
    726             if broken == 0:
    727                 continue
    728             working = counts.get_working(pool)
    729             total = counts.get_total(pool)
    730             data_list.append((board, broken, working, total))
    731         if data_list:
    732             data_list = sorted(data_list, key=lambda d: -d[1])
    733             message.extend(
    734                 ['%-20s   %5d %5d %5d' % t for t in data_list])
    735         else:
    736             message.append('(All boards at full strength)')
    737         newline = '\n'
    738     return '\n'.join(message)
    739 
    740 
    741 def _send_email(arguments, tag, subject, recipients, body):
    742     """Send an inventory e-mail message.
    743 
    744     The message is logged in the selected log directory using `tag`
    745     for the file name.
    746 
    747     If the --print option was requested, the message is neither
    748     logged nor sent, but merely printed on stdout.
    749 
    750     @param arguments   Parsed command-line options.
    751     @param tag         Tag identifying the inventory for logging
    752                        purposes.
    753     @param subject     E-mail Subject: header line.
    754     @param recipients  E-mail addresses for the To: header line.
    755     @param body        E-mail message body.
    756 
    757     """
    758     logging.debug('Generating email: "%s"', subject)
    759     all_recipients = ', '.join(recipients)
    760     report_body = '\n'.join([
    761             'To: %s' % all_recipients,
    762             'Subject: %s' % subject,
    763             '', body, ''])
    764     if arguments.debug:
    765         print report_body
    766     else:
    767         filename = os.path.join(arguments.logdir, tag)
    768         try:
    769             report_file = open(filename, 'w')
    770             report_file.write(report_body)
    771             report_file.close()
    772         except EnvironmentError as e:
    773             logging.error('Failed to write %s:  %s', filename, e)
    774         try:
    775             gmail_lib.send_email(all_recipients, subject, body)
    776         except Exception as e:
    777             logging.error('Failed to send e-mail to %s:  %s',
    778                           all_recipients, e)
    779 
    780 
    781 def _separate_email_addresses(address_list):
    782     """Parse a list of comma-separated lists of e-mail addresses.
    783 
    784     @param address_list  A list of strings containing comma
    785                          separate e-mail addresses.
    786     @return A list of the individual e-mail addresses.
    787 
    788     """
    789     newlist = []
    790     for arg in address_list:
    791         newlist.extend([email.strip() for email in arg.split(',')])
    792     return newlist
    793 
    794 
    795 def _verify_arguments(arguments):
    796     """Validate command-line arguments.
    797 
    798     Join comma separated e-mail addresses for `--board-notify` and
    799     `--pool-notify` in separate option arguments into a single list.
    800 
    801     For non-debug uses, require that notification be requested for
    802     at least one report.  For debug, if notification isn't specified,
    803     treat it as "run all the reports."
    804 
    805     The return value indicates success or failure; in the case of
    806     failure, we also write an error message to stderr.
    807 
    808     @param arguments  Command-line arguments as returned by
    809                       `ArgumentParser`
    810     @return True if the arguments are semantically good, or False
    811             if the arguments don't meet requirements.
    812 
    813     """
    814     arguments.board_notify = _separate_email_addresses(
    815             arguments.board_notify)
    816     arguments.pool_notify = _separate_email_addresses(
    817             arguments.pool_notify)
    818     if not arguments.board_notify and not arguments.pool_notify:
    819         if not arguments.debug:
    820             sys.stderr.write('Must specify at least one of '
    821                              '--board-notify or --pool-notify\n')
    822             return False
    823         else:
    824             # We want to run all the reports.  An empty notify list
    825             # will cause a report to be skipped, so make sure the
    826             # lists are non-empty.
    827             arguments.board_notify = ['']
    828             arguments.pool_notify = ['']
    829     return True
    830 
    831 
    832 def _get_logdir(script):
    833     """Get the default directory for the `--logdir` option.
    834 
    835     The default log directory is based on the parent directory
    836     containing this script.
    837 
    838     @param script  Path to this script file.
    839     @return A path to a directory.
    840 
    841     """
    842     basedir = os.path.dirname(os.path.abspath(script))
    843     basedir = os.path.dirname(basedir)
    844     return os.path.join(basedir, _LOGDIR)
    845 
    846 
    847 def _parse_command(argv):
    848     """Parse the command line arguments.
    849 
    850     Create an argument parser for this command's syntax, parse the
    851     command line, and return the result of the ArgumentParser
    852     parse_args() method.
    853 
    854     @param argv Standard command line argument vector; argv[0] is
    855                 assumed to be the command name.
    856     @return Result returned by ArgumentParser.parse_args().
    857 
    858     """
    859     parser = argparse.ArgumentParser(
    860             prog=argv[0],
    861             description='Gather and report lab inventory statistics')
    862     parser.add_argument('-d', '--duration', type=int,
    863                         default=_DEFAULT_DURATION, metavar='HOURS',
    864                         help='number of hours back to search for status'
    865                              ' (default: %d)' % _DEFAULT_DURATION)
    866     parser.add_argument('--board-notify', action='append',
    867                         default=[], metavar='ADDRESS',
    868                         help='Generate board inventory message, '
    869                         'and send it to the given e-mail address(es)')
    870     parser.add_argument('--pool-notify', action='append',
    871                         default=[], metavar='ADDRESS',
    872                         help='Generate pool inventory message, '
    873                              'and send it to the given address(es)')
    874     parser.add_argument('-r', '--recommend', type=int, default=None,
    875                         help=('Specify how many DUTs should be '
    876                               'recommended for repair (default: no '
    877                               'recommendation)'))
    878     parser.add_argument('--debug', action='store_true',
    879                         help='Print e-mail messages on stdout '
    880                              'without sending them.')
    881     parser.add_argument('--logdir', default=_get_logdir(argv[0]),
    882                         help='Directory where logs will be written.')
    883     parser.add_argument('boardnames', nargs='*',
    884                         metavar='BOARD',
    885                         help='names of boards to report on '
    886                              '(default: all boards)')
    887     arguments = parser.parse_args(argv[1:])
    888     if not _verify_arguments(arguments):
    889         return None
    890     return arguments
    891 
    892 
    893 def _configure_logging(arguments):
    894     """Configure the `logging` module for our needs.
    895 
    896     How we log depends on whether the `--print` option was
    897     provided on the command line.  Without the option, we log all
    898     messages at DEBUG level or above, and write them to a file in
    899     the directory specified by the `--logdir` option.  With the
    900     option, we write log messages to stdout; messages below INFO
    901     level are discarded.
    902 
    903     The log file is configured to rotate once a week on Friday
    904     evening, preserving ~3 months worth of history.
    905 
    906     @param arguments  Command-line arguments as returned by
    907                       `ArgumentParser`
    908 
    909     """
    910     root_logger = logging.getLogger()
    911     if arguments.debug:
    912         root_logger.setLevel(logging.INFO)
    913         handler = logging.StreamHandler(sys.stdout)
    914         handler.setFormatter(logging.Formatter())
    915     else:
    916         root_logger.setLevel(logging.DEBUG)
    917         logfile = os.path.join(arguments.logdir, _LOGFILE)
    918         handler = logging.handlers.TimedRotatingFileHandler(
    919                 logfile, when='W4', backupCount=13)
    920         formatter = logging.Formatter(_LOG_FORMAT,
    921                                       time_utils.TIME_FMT)
    922         handler.setFormatter(formatter)
    923     # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    924     # implicitly imported logging_config, which calls
    925     # logging.basicConfig() *at module level*.  That gives us an
    926     # extra logging handler that we don't want.  So, clear out all
    927     # the handlers here.
    928     for h in root_logger.handlers:
    929         root_logger.removeHandler(h)
    930     root_logger.addHandler(handler)
    931 
    932 
    933 def _populate_board_counts(inventory):
    934     """Gather board counts while providing interactive feedback.
    935 
    936     Gathering the status of all individual DUTs in the lab can take
    937     considerable time (~30 minutes at the time of this writing).
    938 
    939     Normally, we pay that cost by querying as we go.  However, with
    940     the `--print` option, a human being may be watching the
    941     progress.  So, we force the first (expensive) queries to happen
    942     up front, and provide a small ASCII progress bar to give an
    943     indicator of how many boards have been processed.
    944 
    945     @param inventory  _LabInventory object with the inventory to
    946                       be gathered.
    947 
    948     """
    949     n = 0
    950     total_broken = 0
    951     for counts in inventory.values():
    952         n += 1
    953         if n % 10 == 5:
    954             c = '+'
    955         elif n % 10 == 0:
    956             c = '%d' % ((n / 10) % 10)
    957         else:
    958             c = '.'
    959         sys.stdout.write(c)
    960         sys.stdout.flush()
    961         # This next call is where all the time goes - it forces all
    962         # of a board's HostJobHistory objects to query the database
    963         # and cache their results.
    964         total_broken += counts.get_broken()
    965     sys.stdout.write('\n')
    966     sys.stdout.write('Found %d broken DUTs\n' % total_broken)
    967 
    968 
    969 def main(argv):
    970     """Standard main routine.
    971     @param argv  Command line arguments including `sys.argv[0]`.
    972     """
    973     arguments = _parse_command(argv)
    974     if not arguments:
    975         sys.exit(1)
    976     _configure_logging(arguments)
    977     try:
    978         end_time = int(time.time())
    979         start_time = end_time - arguments.duration * 60 * 60
    980         timestamp = time.strftime('%Y-%m-%d.%H',
    981                                   time.localtime(end_time))
    982         logging.debug('Starting lab inventory for %s', timestamp)
    983         if arguments.board_notify:
    984             if arguments.recommend:
    985                 logging.debug('Will include repair recommendations')
    986             logging.debug('Will include board inventory')
    987         if arguments.pool_notify:
    988             logging.debug('Will include pool inventory')
    989 
    990         afe = frontend_wrappers.RetryingAFE(server=None)
    991         inventory = _LabInventory.create_inventory(
    992                 afe, start_time, end_time, arguments.boardnames)
    993         logging.info('Found %d hosts across %d boards',
    994                          inventory.get_num_duts(),
    995                          inventory.get_num_boards())
    996 
    997         if arguments.debug:
    998             _populate_board_counts(inventory)
    999 
   1000         if arguments.board_notify:
   1001             if arguments.recommend:
   1002                 recommend_message = _generate_repair_recommendation(
   1003                         inventory, arguments.recommend) + '\n\n\n'
   1004             else:
   1005                 recommend_message = ''
   1006             board_message = _generate_board_inventory_message(inventory)
   1007             _send_email(arguments,
   1008                         'boards-%s.txt' % timestamp,
   1009                         'DUT board inventory %s' % timestamp,
   1010                         arguments.board_notify,
   1011                         recommend_message + board_message)
   1012 
   1013         if arguments.pool_notify:
   1014             _send_email(arguments,
   1015                         'pools-%s.txt' % timestamp,
   1016                         'DUT pool inventory %s' % timestamp,
   1017                         arguments.pool_notify,
   1018                         _generate_pool_inventory_message(inventory))
   1019     except KeyboardInterrupt:
   1020         pass
   1021     except EnvironmentError as e:
   1022         logging.exception('Unexpected OS error: %s', e)
   1023     except Exception as e:
   1024         logging.exception('Unexpected exception: %s', e)
   1025 
   1026 
   1027 def get_managed_boards(afe):
   1028     end_time = int(time.time())
   1029     start_time = end_time - 24 * 60 * 60
   1030     inventory = _LabInventory.create_inventory(
   1031             afe, start_time, end_time)
   1032     return inventory.get_managed_boards()
   1033 
   1034 
# Script entry point:  run the inventory only when this file is
# executed directly, passing the full argument vector (including
# the command name) to main().
if __name__ == '__main__':
    main(sys.argv)
   1037