#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Adjust pool balances to cover DUT shortfalls.

This command takes all broken DUTs in a specific pool for specific
boards and swaps them with working DUTs taken from a selected pool
of spares.  The command is meant primarily for replacing broken DUTs
in critical pools like BVT or CQ, but it can also be used to adjust
pool sizes, or to create or remove pools.

usage:  balance_pool.py [ options ] POOL [ BOARD ... ]

positional arguments:
  POOL                  Name of the pool to balance
  BOARD                 Names of boards to balance

optional arguments:
  -h, --help            show this help message and exit
  -t COUNT, --total COUNT
                        Set the number of DUTs in the pool to the specified
                        count for every BOARD
  -a COUNT, --grow COUNT
                        Add the specified number of DUTs to the pool for every
                        BOARD
  -d COUNT, --shrink COUNT
                        Remove the specified number of DUTs from the pool for
                        every BOARD
  -s POOL, --spare POOL
                        Pool from which to draw replacement spares (default:
                        pool:suites)
  -n, --dry-run         Report actions to take in the form of shell commands
  -v, --verbose         Print more detail about calculations for debug
                        purposes.
  -m COUNT, --max-broken COUNT
                        Only rebalance a pool if it has at most COUNT broken
                        DUTs.
  -f, --force-rebalance
                        Forcefully rebalance all DUTs in a pool, even if it
                        has a large number of broken DUTs.
  --all-boards          Rebalance all boards.


The command attempts to remove all broken DUTs from the target POOL
for every BOARD, and replace them with enough working DUTs taken
from the spare pool to bring the strength of POOL to the requested
total COUNT.

If no COUNT options are supplied (i.e. there are no --total, --grow,
or --shrink options), the command will maintain the current totals of
DUTs for every BOARD in the target POOL.

If not enough working spares are available, broken DUTs may be left
in the pool to keep the pool at the target COUNT.

When reducing pool size, working DUTs will be returned after broken
DUTs, if it's necessary to achieve the target COUNT.

If the selected target POOL is for a Freon board, *and* the selected
spare pool has no DUTs (in any state), *and* the corresponding
non-Freon spare pool is populated, then the non-Freon pool will
be used for the Freon board.  A similar rule applies to balancing
non-Freon boards when there is an available Freon spare pool.

"""


import argparse
import sys
import time

import common
from autotest_lib.server import frontend
from autotest_lib.site_utils import host_label_utils
from autotest_lib.site_utils import status_history
from autotest_lib.site_utils.suite_scheduler import constants

from chromite.lib import parallel


_POOL_PREFIX = constants.Labels.POOL_PREFIX
_BOARD_PREFIX = constants.Labels.BOARD_PREFIX

_FREON_BOARD_TAG = 'freon'


def _log_message(message, *args):
    """Log a message with optional format arguments to stdout.

    This function logs a single line to stdout, with formatting
    if necessary, and without adornments.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if args:
        message = message % args
    sys.stdout.write('%s\n' % message)


def _log_info(dry_run, message, *args):
    """Log information in a dry-run dependent fashion.

    This function logs a single line to stdout, with formatting
    if necessary.  When logging for a dry run, the message is
    printed as a shell comment, rather than as unadorned text.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param dry_run  Whether the logging is for a dry run; if true,
                    the message is prefixed with '# '.
    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if dry_run:
        message = '# ' + message
    _log_message(message, *args)


def _log_error(message, *args):
    """Log an error to stderr, with optional format arguments.

    This function logs a single line to stderr, prefixed to indicate
    that it is an error message.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if args:
        message = message % args
    sys.stderr.write('ERROR: %s\n' % message)


class _DUTPool(object):
    """Information about a pool of DUTs for a given board.

    This class collects information about all DUTs for a given
    board and pool pair, and divides them into three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from
          this pool.  The DUT may be either working or broken.

    DUTs with more than one pool: label are ineligible for exchange
    during balancing.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property board               Name of the board associated with
                                  this pool of DUTs.
    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property working_hosts       The list of this pool's working
                                  DUTs.
    @property broken_hosts        The list of this pool's broken
                                  DUTs.
    @property ineligible_hosts    The list of this pool's ineligible DUTs.
    @property labels              A set of labels that identify a DUT
                                  as part of this pool.
    @property total_hosts         The total number of hosts in pool.

    """

    @staticmethod
    def _get_platform_label(board):
        """Return the platform label associated with `board`.

        When swapping between freon and non-freon boards, the
        platform label must also change (because wmatrix reports
        build results against platform labels, not boards).  So, we
        must be able to get the platform label from the board name.

        For non-freon boards, the platform label is based on a name
        assigned by the firmware, which in some cases is different
        from the board name.  For freon boards, the platform label
        is always the board name.

        @param board The board name to convert to a platform label.
        @return The platform label for the given board name.

        """
        if board.endswith(_FREON_BOARD_TAG):
            return board
        if board.startswith('x86-'):
            return board[len('x86-'):]
        platform_map = {
            'daisy': 'snow',
            'daisy_spring': 'spring',
            'daisy_skate': 'skate',
            'parrot_ivb': 'parrot_2',
            'falco_li': 'falco',
        }
        return platform_map.get(board, board)

    @staticmethod
    def _freon_board_toggle(board):
        """Toggle a board name between freon and non-freon.

        For boards naming a freon build, return the name of the
        associated non-freon board.  For boards naming non-freon
        builds, return the name of the associated freon board.

        @param board The board name to be toggled.
        @return A new board name, toggled for freon.

        """
        if board.endswith(_FREON_BOARD_TAG):
            # The actual board name ends with either "-freon" or
            # "_freon", so we have to strip off one extra character.
            return board[:-len(_FREON_BOARD_TAG) - 1]
        # The actual board name will end with either "-freon" or
        # "_freon"; we have to figure out which one to use.
        joiner = '_'
        if joiner in board:
            joiner = '-'
        return joiner.join([board, _FREON_BOARD_TAG])

    def __init__(self, afe, board, pool, start_time, end_time,
                 use_freon=False):
        """Collect and classify the DUTs for `board` in `pool`.

        @param afe         AFE object passed through to the
                           `HostJobHistory` queries.
        @param board       Name of the board for this pool.
        @param pool        Name of the pool.
        @param start_time  Start time for the `HostJobHistory` objects.
        @param end_time    End time for the `HostJobHistory` objects.
        @param use_freon   If true and no DUTs are found for `board`,
                           retry the query with the freon-toggled
                           board name.

        """
        self.board = board
        self.pool = pool
        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        # N.B. _get_hosts() may replace self.board with the
        # freon-toggled name, so it must run before the labels
        # are calculated.
        self.total_hosts = self._get_hosts(
                afe, start_time, end_time, use_freon)
        self.labels = {_BOARD_PREFIX + self.board,
                       self._get_platform_label(self.board),
                       _POOL_PREFIX + self.pool}

    def _get_hosts(self, afe, start_time, end_time, use_freon):
        """Query the DUT histories and sort the DUTs into categories.

        Fills in `working_hosts`, `broken_hosts`, and
        `ineligible_hosts`.  If the query for `self.board` finds
        nothing and `use_freon` is true, the query is repeated for
        the freon-toggled board name; if that query finds DUTs,
        `self.board` is changed to the alternate name.

        @param afe         AFE object passed to the history query.
        @param start_time  Start time for the history query.
        @param end_time    End time for the history query.
        @param use_freon   Whether to fall back to the freon-toggled
                           board name when no DUTs are found.

        @return The total number of DUTs found.

        """
        all_histories = (
            status_history.HostJobHistory.get_multiple_histories(
                    afe, start_time, end_time,
                    board=self.board, pool=self.pool))
        if not all_histories and use_freon:
            alternate_board = self._freon_board_toggle(self.board)
            alternate_histories = (
                status_history.HostJobHistory.get_multiple_histories(
                        afe, start_time, end_time,
                        board=alternate_board, pool=self.pool))
            if alternate_histories:
                self.board = alternate_board
                all_histories = alternate_histories
        for h in all_histories:
            host = h.host
            host_pools = [l for l in host.labels
                          if l.startswith(_POOL_PREFIX)]
            # A DUT that doesn't have exactly one pool: label is
            # never exchanged.
            if len(host_pools) != 1:
                self.ineligible_hosts.append(host)
            else:
                diag = h.last_diagnosis()[0]
                if (diag == status_history.WORKING and
                        not host.locked):
                    self.working_hosts.append(host)
                else:
                    self.broken_hosts.append(host)
        return len(all_histories)

    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        A copy is returned, so that callers can't accidentally
        alter this pool's own `labels`.

        @return A set of AFE labels; convert to a list for use with
                the hosts' `add_labels()` or `remove_labels()`.

        """
        return set(self.labels)

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        num_ineligible = len(self.ineligible_hosts)
        if target_total < num_ineligible:
            _log_error('%s %s pool: Target of %d is below '
                       'minimum of %d DUTs.',
                       self.board, self.pool,
                       target_total, num_ineligible)
            _log_error('Adjusting target to %d DUTs.', num_ineligible)
            target_total = num_ineligible
        adjustment = target_total - self.total_hosts
        return len(self.broken_hosts) + adjustment

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken >= 0:
            return self.broken_hosts[num_broken:]
        return self.broken_hosts + self.working_hosts[:-num_broken]


def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.  Labels
    common to both pools are left untouched.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object to which the hosts
                         will be added.
    @param spare_pool    The `_DUTPool` object from which the hosts
                         are drawn.

    """
    if not hosts:
        return
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    # Use non-mutating set differences:  an in-place `-=` on the
    # sets handed out by the pools would strip the shared labels
    # from the _DUTPool objects themselves.
    target_labels = set(target_pool.pool_labels)
    spare_labels = set(spare_pool.pool_labels)
    # Sorting makes the reported commands deterministic.
    additions = sorted(target_labels - spare_labels)
    removals = sorted(spare_labels - target_labels)
    for host in hosts:
        if not dry_run:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(list(removals))
            host.add_labels(list(additions))
        else:
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(removals))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(additions))


def _balance_board(arguments, afe, board, start_time, end_time):
    """Balance one board as requested by command line arguments.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param board         Board to be balanced.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    dry_run = arguments.dry_run
    spare_pool = _DUTPool(afe, board, arguments.spare,
                          start_time, end_time, use_freon=True)
    main_pool = _DUTPool(afe, board, arguments.pool,
                         start_time, end_time)

    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Broken DUTs that must stay because spares ran short.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative:  the magnitude is the number of working
        # DUTs to return in addition to all the broken ones.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', board, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts),
                  len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available.',
                  board, main_pool.pool, len(spare_pool.working_hosts))

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      board, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      board, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    # Safety valve:  a large number of broken DUTs may indicate a bug
    # that is bricking devices, so refuse to act unless forced.
    if (len(main_pool.broken_hosts) > arguments.max_broken and
            not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   board, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this board to see if there is a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        return

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(dry_run, 'No exchange required.')
        return

    # Return the surplus to the spare pool first, then draw the
    # replacements from it.
    _exchange_labels(dry_run, surplus_duts, spare_pool, main_pool)
    _exchange_labels(dry_run, spare_duts, main_pool, spare_pool)


def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    Exactly one of a list of boards or `--all-boards` must be given;
    otherwise the parser reports an error.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Balance pool shortages from spares on reserve')

    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every BOARD')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every BOARD')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every BOARD')

    parser.add_argument('-s', '--spare', default='suites',
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:suites)')
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')

    parser.add_argument('--all-boards', action='store_true',
                        help='Rebalance all boards.')

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.')
    parser.add_argument('boards', nargs='*',
                        metavar='BOARD',
                        help='Names of boards to balance.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if not arguments.boards and not arguments.all_boards:
        parser.error('No boards specified. To balance all boards, use '
                     '--all-boards')
    if arguments.boards and arguments.all_boards:
        parser.error('Cannot specify boards with --all-boards.')

    return arguments


def main(argv):
    """Standard main routine.

    Parses the command line, then balances every requested board
    using history from the last 24 hours.  The boards are handed to
    `parallel.RunTasksInProcessPool()` with 8 processes.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    def balancer(i, board):
        """Balance the specified board.

        @param i The index of the board.
        @param board The board name.
        """
        # Separate the reports for successive boards with a blank line.
        if i > 0:
            _log_message('')
        _balance_board(arguments, afe, board, start_time, end_time)

    arguments = _parse_command(argv)
    end_time = time.time()
    start_time = end_time - 24 * 60 * 60
    afe = frontend.AFE(server=None)
    boards = arguments.boards
    if arguments.all_boards:
        boards = host_label_utils.get_all_boards(
                labels=[_POOL_PREFIX + arguments.pool])
    board_args = list(enumerate(boards))
    try:
        parallel.RunTasksInProcessPool(balancer, board_args, processes=8)
    except KeyboardInterrupt:
        # A user interrupt is treated as a normal way to stop.
        pass


if __name__ == '__main__':
    main(sys.argv)