#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Adjust pool balances to cover DUT shortfalls.

This command takes all broken DUTs in a specific pool for specific
models and swaps them with working DUTs taken from a selected pool
of spares. The command is meant primarily for replacing broken DUTs
in critical pools like BVT or CQ, but it can also be used to adjust
pool sizes, or to create or remove pools.

usage: balance_pool.py [ options ] POOL MODEL [ MODEL ... ]

positional arguments:
  POOL                  Name of the pool to balance
  MODEL                 Names of models to balance

optional arguments:
  -h, --help            show this help message and exit
  -t COUNT, --total COUNT
                        Set the number of DUTs in the pool to the specified
                        count for every MODEL
  -a COUNT, --grow COUNT
                        Add the specified number of DUTs to the pool for every
                        MODEL
  -d COUNT, --shrink COUNT
                        Remove the specified number of DUTs from the pool for
                        every MODEL
  -s POOL, --spare POOL
                        Pool from which to draw replacement spares (default:
                        pool:suites)
  -p PHASE, --phase PHASE
                        Phase to restrict the balance pool operation to
  --sku SKU             The specific SKU we intend to swap with
  -n, --dry-run         Report actions to take in the form of shell commands


The command attempts to remove all broken DUTs from the target POOL
for every MODEL, and replace them with enough working DUTs taken
from the spare pool to bring the strength of POOL to the requested
total COUNT.

If no COUNT options are supplied (i.e. there are no --total, --grow,
or --shrink options), the command will maintain the current totals of
DUTs for every MODEL in the target POOL.

If not enough working spares are available, broken DUTs may be left
in the pool to keep the pool at the target COUNT.

When reducing pool size, working DUTs will be returned after broken
DUTs, if it's necessary to achieve the target COUNT.

"""


import argparse
import os
import re
import sys
import time

import common
from autotest_lib.server import constants
from autotest_lib.server import site_utils
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.lib import status_history
from autotest_lib.site_utils import lab_inventory
from autotest_lib.utils import labellib
from chromite.lib import metrics
from chromite.lib import parallel

#This must be imported after chromite.lib.metrics
from infra_libs import ts_mon

_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all models we should calculate the default max
# number of broken models against.  It seemed like the best choice that
# was neither too strict nor lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

_ALL_CRITICAL_POOLS = 'all_critical_pools'
_SPARE_DEFAULT = lab_inventory.SPARE_POOL


# _VALID_POOL_PATTERN - Regular expression matching pool names that will
# be accepted on the command line.
#
# Note: This pattern was selected merely to recognize all existing pool
# names; there's no underlying technical restriction motivating this
# pattern.  No reasonable request to add more special characters to the
# allowed set should be refused.
#
# The upper-case range must be `A-Z`:  `A-z` would also admit the ASCII
# punctuation that sits between 'Z' and 'a' ('[', '\', ']', '^', '`').

_VALID_POOL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-]+$')


def _log_message(message, *args):
    """Log a message with optional format arguments to stdout.

    This function logs a single line to stdout, with formatting
    if necessary, and without adornments.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if args:
        message = message % args
    sys.stdout.write('%s\n' % message)


def _log_info(dry_run, message, *args):
    """Log information in a dry-run dependent fashion.

    This function logs a single line to stdout, with formatting
    if necessary.  When logging for a dry run, the message is
    printed as a shell comment, rather than as unadorned text.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param dry_run  Whether the logging is for a dry run; if true,
                    the message is prefixed with '# '.
    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if dry_run:
        message = '# ' + message
    _log_message(message, *args)


def _log_error(message, *args):
    """Log an error to stderr, with optional format arguments.

    This function logs a single line to stderr, prefixed to indicate
    that it is an error message.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if args:
        message = message % args
    sys.stderr.write('ERROR: %s\n' % message)


class _DUTPool(object):
    """Information about a pool of DUTs matching given labels.

    This class collects information about all DUTs for a given pool and matching
    the given labels, and divides them into three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from this pool.  The
            DUT may be either working or broken.

    DUTs with more than one pool: label are ineligible for exchange
    during balancing.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property labels              Labels that constrain the DUTs to consider.
    @property working_hosts       The list of this pool's working DUTs.
    @property broken_hosts        The list of this pool's broken DUTs.
    @property ineligible_hosts    The list of this pool's ineligible DUTs.
    @property pool_labels         A list of labels that identify a DUT as part
                                  of this pool.
    @property total_hosts         The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        """Collect and classify the DUTs of `pool` matching `labels`.

        @param afe          AFE object used to look up host histories.
        @param pool         Name of the pool (without the pool prefix).
        @param labels       Labels restricting which DUTs are considered.
        @param start_time   Start time for the HostJobHistory query.
        @param end_time     End time for the HostJobHistory query.

        """
        self.pool = pool
        self.labels = labellib.LabelsMapping(labels)
        # The pool itself is always part of the selection criteria.
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + self.pool]

        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)


    def _get_hosts(self, afe, start_time, end_time):
        """Fetch this pool's hosts and sort them into categories.

        Fills in `working_hosts`, `broken_hosts` and
        `ineligible_hosts` as a side effect.

        @param afe          AFE object used to look up host histories.
        @param start_time   Start time for the HostJobHistory query.
        @param end_time     End time for the HostJobHistory query.

        @return The total number of hosts found, across all three
                categories.

        """
        all_histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for h in all_histories:
            host = h.host
            host_pools = [l for l in host.labels
                          if l.startswith(_POOL_PREFIX)]
            # A host in more than one pool must never be exchanged.
            if len(host_pools) != 1:
                self.ineligible_hosts.append(host)
            else:
                diag = h.last_diagnosis()[0]
                # Locked hosts count as broken even when diagnosed
                # as working.
                if (diag == status_history.WORKING and
                        not host.locked):
                    self.working_hosts.append(host)
                else:
                    self.broken_hosts.append(host)
        return len(all_histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        num_ineligible = len(self.ineligible_hosts)
        spares_needed = target_total >= num_ineligible
        metrics.Boolean(
            'chromeos/autotest/balance_pools/exhausted_pools',
            'True for each pool/model which requests more DUTs than supplied',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need
            # to leave it here until we do the extra work Monarch
            # requires to delete a field.
            field_spec=[
                    ts_mon.StringField('pool'),
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
            ]).set(
                    not spares_needed,
                    fields={
                            'pool': self.pool,
                            'board': self.labels.get('model', ''),
                            'model': self.labels.get('model', ''),
                    },
        )
        if not spares_needed:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, num_ineligible,
            )
            _log_error('Adjusting target to %d DUTs.', num_ineligible)
            target_total = num_ineligible
        else:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         self.labels.get('model', ''), self.pool, target_total)
        adjustment = target_total - self.total_hosts
        return len(self.broken_hosts) + adjustment

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken >= 0:
            surplus = self.broken_hosts[num_broken:]
            return surplus
        else:
            return (self.broken_hosts +
                    self.working_hosts[:-num_broken])


def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    The transfer is logged and counted in metrics even when `hosts`
    is empty.

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object to which the hosts
                         will be added.
    @param spare_pool    The `_DUTPool` object from which the hosts
                         are drawn.

    """
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    metrics.Counter(
        'chromeos/autotest/balance_pools/duts_moved',
        'DUTs transferred between pools',
        # TODO(jrbarnette) The 'board' field is a legacy.  We need to
        # leave it here until we do the extra work Monarch requires to
        # delete a field.
        field_spec=[
                ts_mon.StringField('board'),
                ts_mon.StringField('model'),
                ts_mon.StringField('source_pool'),
                ts_mon.StringField('target_pool'),
        ]
    ).increment_by(
            len(hosts),
            fields={
                    'board': target_pool.labels.get('model', ''),
                    'model': target_pool.labels.get('model', ''),
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    for host in hosts:
        if not dry_run:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(removals)
            host.add_labels(additions)
        else:
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(removals))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(additions))


def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param pool          Pool of the model to be balanced.
    @param labels        Restrict the balancing operation within DUTs
                         that have these labels.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    # Without a COUNT option, the pool keeps its current size.
    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Positive shortfall: number of broken DUTs that must stay
        # in the pool because no spare can replace them.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative: number of working DUTs to give back.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts), len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    # Safety valve: many broken DUTs for one model suggests a bug that
    # is bricking devices, so don't feed it more spares unless forced.
    if (len(main_pool.broken_hosts) > arguments.max_broken and
            not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this model to for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        spare_duts = []
        surplus_duts = []

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(arguments.dry_run, 'No exchange required.')

    # The exchanges are invoked even with empty lists, so that the
    # transfer metric is still reported (with a count of zero).
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)


def _too_many_broken(inventory, pool, args):
    """
    Get the inventory of models and check if too many are broken.

    @param inventory: _LabInventory object.
    @param pool: The pool to check.
    @param args: Parsed command line arguments.

    @return True if the number of models with 1 or more broken duts
            exceed max_broken_models, False otherwise.
    """
    # Were we asked to skip this check?
    if (args.force_rebalance or
            (args.all_models and args.max_broken_models == 0)):
        return False

    max_broken = args.max_broken_models
    if max_broken is None:
        # No explicit limit: derive one from the number of models.
        total_num = len(inventory.get_pool_models(pool))
        max_broken = int(_MAX_BROKEN_DEFAULT_RATIO * total_num)
    _log_info(args.dry_run,
              'Max broken models for pool %s: %d',
              pool, max_broken)

    broken = [model for model, counts in inventory.iteritems()
              if counts.get_broken(pool) != 0]
    _log_message('There are %d models in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)',
                 len(broken), pool, max_broken)
    for b in sorted(broken):
        _log_message(b)
    return len(broken) > max_broken


def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    Invalid argument combinations and invalid pool names are reported
    through `ArgumentParser.error()`.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models.  This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if number of models with broken '
                 'DUTs in the specified pool is less than COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('-p', '--phase', metavar='PHASE',
                        help='Optional phase label to restrict balance '
                             'operation to.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    # Balancing all critical pools only works with the default spare
    # pool, so any other --spare value is rejected.
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    for p in (arguments.spare, arguments.pool):
        if not _VALID_POOL_PATTERN.match(p):
            parser.error('Invalid pool name: %s' % p)
    return arguments


def infer_balancer_targets(afe, arguments, pools):
    """Take some arguments and translate them to a list of models to balance

    With `--all-models`, every model in each pool is targeted, unless
    the pool has too many models with broken DUTs; in that case the
    pool is skipped and an error is logged.  Otherwise, the models
    named on the command line are targeted in every pool.

    Args:
    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns    a list of (pool, labels) tuples to be balanced

    """
    balancer_targets = []
    # Taking inventory is expensive and does not depend on the pool,
    # so fetch it lazily and at most once for all pools.
    inventory = None

    for pool in pools:
        if arguments.all_models:
            if inventory is None:
                inventory = lab_inventory.get_inventory(afe)
            quarantine = _too_many_broken(inventory, pool, arguments)
            if quarantine:
                _log_error('Refusing to balance all models for %s pool, '
                           'too many models with at least 1 broken DUT '
                           'detected.', pool)
            else:
                for model in inventory.get_pool_models(pool):
                    labels = labellib.LabelsMapping()
                    labels['model'] = model
                    if arguments.phase:
                        labels['phase'] = arguments.phase
                    balancer_targets.append((pool, labels.getlabels()))
            metrics.Boolean(
                'chromeos/autotest/balance_pools/unchanged_pools').set(
                    quarantine, fields={'pool': pool})
            _log_message('Pool %s quarantine status: %s', pool, quarantine)
        else:
            for model in arguments.models:
                labels = labellib.LabelsMapping()
                labels['model'] = model
                if arguments.sku:
                    labels['sku'] = arguments.sku
                if arguments.phase:
                    labels['phase'] = arguments.phase
                balancer_targets.append((pool, labels.getlabels()))
    return balancer_targets


def main(argv):
    """Standard main routine.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    # Metrics are only set up for production runs.
    if arguments.production:
        metrics_manager = site_utils.SetupTsMonGlobalState('balance_pools',
                                                           indirect=True)
    else:
        metrics_manager = site_utils.TrivialContextManager()

    with metrics_manager:
        with metrics.SuccessCounter('chromeos/autotest/balance_pools/runs'):
            # Host histories are examined over the last 24 hours.
            end_time = time.time()
            start_time = end_time - 24 * 60 * 60
            afe = frontend_wrappers.RetryingAFE(server=arguments.web)

            def balancer(pool, labels):
                """Balance the specified model.

                @param pool: The pool to rebalance for the model.
                @param labels: labels to restrict to balancing operations
                        within.
                """
                _balance_model(arguments, afe, pool, labels,
                               start_time, end_time)
                _log_message('')

            pools = (lab_inventory.CRITICAL_POOLS
                     if arguments.pool == _ALL_CRITICAL_POOLS
                     else [arguments.pool])
            balancer_targets = infer_balancer_targets(afe, arguments, pools)
            try:
                parallel.RunTasksInProcessPool(
                        balancer,
                        balancer_targets,
                        processes=8,
                )
            except KeyboardInterrupt:
                # An interrupt simply ends the run early.
                pass


if __name__ == '__main__':
    main(sys.argv)