#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Adjust pool balances to cover DUT shortfalls.

This command takes all broken DUTs in a specific pool for specific
models and swaps them with working DUTs taken from a selected pool
of spares.  The command is meant primarily for replacing broken DUTs
in critical pools like BVT or CQ, but it can also be used to adjust
pool sizes, or to create or remove pools.

usage:  balance_pool.py [ options ] POOL MODEL [ MODEL ... ]

positional arguments:
  POOL                  Name of the pool to balance
  MODEL                 Names of models to balance

optional arguments:
  -h, --help            show this help message and exit
  -t COUNT, --total COUNT
                        Set the number of DUTs in the pool to the specified
                        count for every MODEL
  -a COUNT, --grow COUNT
                        Add the specified number of DUTs to the pool for every
                        MODEL
  -d COUNT, --shrink COUNT
                        Remove the specified number of DUTs from the pool for
                        every MODEL
  -s POOL, --spare POOL
                        Pool from which to draw replacement spares (default:
                        pool:suites)
  --sku SKU             The specific SKU we intend to swap with
  -n, --dry-run         Report actions to take in the form of shell commands


The command attempts to remove all broken DUTs from the target POOL
for every MODEL, and replace them with enough working DUTs taken
from the spare pool to bring the strength of POOL to the requested
total COUNT.

If no COUNT options are supplied (i.e. there are no --total, --grow,
or --shrink options), the command will maintain the current totals of
DUTs for every MODEL in the target POOL.

If not enough working spares are available, broken DUTs may be left
in the pool to keep the pool at the target COUNT.

When reducing pool size, working DUTs will be returned after broken
DUTs, if it's necessary to achieve the target COUNT.

"""


import argparse
import sys
import time

import common
from autotest_lib.server import constants
from autotest_lib.server import frontend
from autotest_lib.server import site_utils
from autotest_lib.server.lib import status_history
from autotest_lib.site_utils import lab_inventory
from autotest_lib.utils import labellib
from chromite.lib import metrics
from chromite.lib import parallel

# This must be imported after chromite.lib.metrics
from infra_libs import ts_mon

_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all models we should calculate the default max
# number of broken models against.  It seemed like the best choice that
# was neither too strict nor lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

_ALL_CRITICAL_POOLS = 'all_critical_pools'
_SPARE_DEFAULT = lab_inventory.SPARE_POOL


def _log_message(message, *args):
    """Log a message with optional format arguments to stdout.

    This function logs a single line to stdout, with formatting
    if necessary, and without adornments.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if args:
        message = message % args
    sys.stdout.write('%s\n' % message)


def _log_info(dry_run, message, *args):
    """Log information in a dry-run dependent fashion.

    This function logs a single line to stdout, with formatting
    if necessary.  When logging for a dry run, the message is
    printed as a shell comment, rather than as unadorned text.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param dry_run  Whether the logging is for a dry run.  If true,
                    the message is prefixed with '# '.
    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if dry_run:
        message = '# ' + message
    _log_message(message, *args)


def _log_error(message, *args):
    """Log an error to stderr, with optional format arguments.

    This function logs a single line to stderr, prefixed to indicate
    that it is an error message.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if args:
        message = message % args
    sys.stderr.write('ERROR: %s\n' % message)


class _DUTPool(object):
    """Information about a pool of DUTs matching given labels.

    This class collects information about all DUTs for a given pool and
    matching the given labels, and divides them into three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from this
            pool.  The DUT may be either working or broken.

    DUTs with more than one pool: label are ineligible for exchange
    during balancing.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property labels              Labels that constrain the DUTs to
                                  consider.
    @property working_hosts       The list of this pool's working DUTs.
    @property broken_hosts        The list of this pool's broken DUTs.
    @property ineligible_hosts    The list of this pool's ineligible
                                  DUTs.
    @property pool_labels         A list of labels that identify a DUT
                                  as part of this pool.
    @property total_hosts         The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        """Collect and classify the DUTs in `pool` matching `labels`.

        @param afe         AFE object used to look up host histories.
        @param pool        Name of the pool (without the pool prefix).
        @param labels      Labels restricting which DUTs are considered.
        @param start_time  Start time for the HostJobHistory objects.
        @param end_time    End time for the HostJobHistory objects.

        """
        self.pool = pool
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + self.pool]

        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)

    def _get_hosts(self, afe, start_time, end_time):
        """Sort this pool's DUTs into working, broken and ineligible.

        Fills in `working_hosts`, `broken_hosts` and
        `ineligible_hosts` as a side effect.

        @param afe         AFE object used to look up host histories.
        @param start_time  Start time for the HostJobHistory objects.
        @param end_time    End time for the HostJobHistory objects.

        @return The total number of DUTs found, in all categories.

        """
        all_histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for h in all_histories:
            host = h.host
            host_pools = [l for l in host.labels
                          if l.startswith(_POOL_PREFIX)]
            # A DUT that doesn't have exactly one pool label must stay
            # where it is, whatever its condition.
            if len(host_pools) != 1:
                self.ineligible_hosts.append(host)
                continue
            diag = h.last_diagnosis()[0]
            if diag == status_history.WORKING and not host.locked:
                self.working_hosts.append(host)
            else:
                self.broken_hosts.append(host)
        return len(all_histories)

    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        num_ineligible = len(self.ineligible_hosts)
        # Ineligible DUTs can't leave the pool, so they set a floor on
        # the pool size that can actually be reached.
        target_ok = target_total >= num_ineligible
        metrics.Boolean(
            'chromeos/autotest/balance_pools/exhausted_pools',
            'True for each pool/model which requests more DUTs than supplied',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need
            # to leave it here until we do the extra work Monarch
            # requires to delete a field.
            field_spec=[
                    ts_mon.StringField('pool'),
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
            ]).set(
                    not target_ok,
                    fields={
                            'pool': self.pool,
                            'board': self.labels.get('model', ''),
                            'model': self.labels.get('model', ''),
                    },
        )
        if not target_ok:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, num_ineligible,
            )
            _log_error('Adjusting target to %d DUTs.', num_ineligible)
            target_total = num_ineligible
        else:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         self.labels.get('model', ''), self.pool,
                         target_total)
        adjustment = target_total - self.total_hosts
        return len(self.broken_hosts) + adjustment

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken >= 0:
            return self.broken_hosts[num_broken:]
        # Negative: give back every broken DUT plus `-num_broken`
        # working ones.
        return self.broken_hosts + self.working_hosts[:-num_broken]


def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    The transfer is logged and the `duts_moved` metric is updated
    even when `hosts` is empty.

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object to which the hosts
                         will be added.
    @param spare_pool    The `_DUTPool` object from which the hosts
                         are drawn.

    """
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    metrics.Counter(
        'chromeos/autotest/balance_pools/duts_moved',
        'DUTs transferred between pools',
        # TODO(jrbarnette) The 'board' field is a legacy.  We need to
        # leave it here until we do the extra work Monarch requires to
        # delete a field.
        field_spec=[
                ts_mon.StringField('board'),
                ts_mon.StringField('model'),
                ts_mon.StringField('source_pool'),
                ts_mon.StringField('target_pool'),
        ]
    ).increment_by(
            len(hosts),
            fields={
                    'board': target_pool.labels.get('model', ''),
                    'model': target_pool.labels.get('model', ''),
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    for host in hosts:
        if not dry_run:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(removals)
            host.add_labels(additions)
        else:
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(removals))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(additions))


def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param pool          Pool of the model to be balanced.
    @param labels        Restrict the balancing operation within DUTs
                         that have these labels.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    # With no COUNT option, the pool keeps its current size.
    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Positive shortfall: that many broken DUTs must stay behind
        # because there aren't enough working spares.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative: the pool is shrinking; `-shortfall` working
        # DUTs are returned on top of the broken ones.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts),
                  len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    # Safety valve:  a pool with many broken DUTs is left alone unless
    # the caller explicitly forces the rebalance.
    if (len(main_pool.broken_hosts) > arguments.max_broken and
            not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this model to for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        spare_duts = []
        surplus_duts = []

    if arguments.verbose and not spare_duts and not surplus_duts:
        _log_info(arguments.dry_run, 'No exchange required.')

    # These calls are made even when the lists are empty:
    # _exchange_labels() logs and updates its metric before it checks
    # for an empty host list.
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)


def _too_many_broken(inventory, pool, args):
    """
    Get the inventory of models and check if too many are broken.

    The check is skipped (returning False) when --force-rebalance is
    given, or when --all-models is used with --max-broken-models 0.
    If --max-broken-models is not given, the threshold defaults to
    `_MAX_BROKEN_DEFAULT_RATIO` of the models in the pool.

    @param inventory: _LabInventory object.
    @param pool: The pool to check.
    @param args: Parsed command line arguments.

    @return True if the number of models with 1 or more broken duts
            exceed max_broken_models, False otherwise.
    """
    # Were we asked to skip this check?
    if (args.force_rebalance or
            (args.all_models and args.max_broken_models == 0)):
        return False

    max_broken = args.max_broken_models
    if max_broken is None:
        total_num = len(inventory.get_pool_models(pool))
        max_broken = int(_MAX_BROKEN_DEFAULT_RATIO * total_num)
    _log_info(args.dry_run,
              'Max broken models for pool %s: %d',
              pool, max_broken)

    broken = [model for model, counts in inventory.iteritems()
              if counts.get_broken(pool) != 0]
    _log_message('There are %d models in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)',
                 len(broken), pool, max_broken)
    for b in sorted(broken):
        _log_message(b)
    return len(broken) > max_broken


def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    Invalid combinations of arguments are reported through
    `ArgumentParser.error()`.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models.  This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if number of models with broken '
                 'DUTs in the specified pool is less than COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    # When balancing every critical pool, only the default spare pool
    # is accepted.
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    return arguments


def infer_balancer_targets(afe, arguments, pools):
    """Take some arguments and translate them to a list of models to balance

    With --all-models, every model in each pool is targeted, unless
    `_too_many_broken()` quarantines the pool; `--sku` is not applied
    in that mode.  Otherwise the models named on the command line are
    targeted, restricted to `--sku` if given.

    Args:
    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns    a list of (pool, labels) tuples to be balanced

    """
    balancer_targets = []
    # Taking inventory is expensive and doesn't depend on the pool, so
    # do it at most once, and only when --all-models needs it.
    inventory = None

    for pool in pools:
        if arguments.all_models:
            if inventory is None:
                inventory = lab_inventory.get_inventory(afe)
            quarantine = _too_many_broken(inventory, pool, arguments)
            if quarantine:
                _log_error('Refusing to balance all models for %s pool, '
                           'too many models with at least 1 broken DUT '
                           'detected.', pool)
            else:
                # Use the same accessor as _too_many_broken(), so the
                # models balanced are the ones the threshold was
                # computed from.
                for model in inventory.get_pool_models(pool):
                    labels = labellib.LabelsMapping()
                    labels['model'] = model
                    balancer_targets.append((pool, labels.getlabels()))
            metrics.Boolean(
                'chromeos/autotest/balance_pools/unchanged_pools').set(
                    quarantine, fields={'pool': pool})
            _log_message('Pool %s quarantine status: %s', pool, quarantine)
        else:
            for model in arguments.models:
                labels = labellib.LabelsMapping()
                labels['model'] = model
                if arguments.sku:
                    labels['sku'] = arguments.sku
                balancer_targets.append((pool, labels.getlabels()))
    return balancer_targets


def main(argv):
    """Standard main routine.

    Parses the command line, works out which (pool, labels) targets
    to balance, and balances them in a pool of worker processes.
    Metrics are set up only for `--production` runs.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    if arguments.production:
        metrics_manager = site_utils.SetupTsMonGlobalState(
                'balance_pools',
                indirect=False,
                auto_flush=False,
        )
    else:
        metrics_manager = site_utils.TrivialContextManager()

    with metrics_manager:
        # DUT status is judged from the last 24 hours of history.
        end_time = time.time()
        start_time = end_time - 24 * 60 * 60
        afe = frontend.AFE(server=arguments.web)

        def balancer(pool, labels):
            """Balance the specified model.

            @param pool: The pool to rebalance for the model.
            @param labels: labels to restrict to balancing operations
                    within.
            """
            _balance_model(arguments, afe, pool, labels,
                           start_time, end_time)
            _log_message('')

        pools = (lab_inventory.CRITICAL_POOLS
                 if arguments.pool == _ALL_CRITICAL_POOLS
                 else [arguments.pool])
        balancer_targets = infer_balancer_targets(afe, arguments, pools)
        try:
            parallel.RunTasksInProcessPool(
                    balancer,
                    balancer_targets,
                    processes=8,
            )
        except KeyboardInterrupt:
            # Deliberate: an interrupted run ends quietly; metrics are
            # still flushed below.
            pass
        finally:
            metrics.Flush()


if __name__ == '__main__':
    main(sys.argv)