1 #!/usr/bin/python 2 # 3 # Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 4 # Use of this source code is governed by a BSD-style license that can be 5 # found in the LICENSE file. 6 7 8 """Tool for running suites of tests and waiting for completion. 9 10 The desired test suite will be scheduled with autotest. By default, 11 this tool will block until the job is complete, printing a summary 12 at the end. Error conditions result in exceptions. 13 14 This is intended for use only with Chrome OS test suites that leverage the 15 dynamic suite infrastructure in server/cros/dynamic_suite.py. 16 17 This script exits with one of the following codes: 18 0 - OK: Suite finished successfully 19 1 - ERROR: Test(s) failed, or hit their own timeout 20 2 - WARNING: Test(s) raised a warning or passed on retry, none failed/timed out. 21 3 - INFRA_FAILURE: Infrastructure related issues, e.g. 22 * Lab is down 23 * Too many duts (defined as a constant) in repair failed status 24 * Suite job issues, like bug in dynamic suite, 25 user aborted the suite, lose a drone/all devservers/rpc server, 26 0 tests ran, etc. 27 * provision failed 28 TODO(fdeng): crbug.com/413918, reexamine treating all provision 29 failures as INFRA failures. 30 4 - SUITE_TIMEOUT: Suite timed out, some tests ran, 31 none failed by the time the suite job was aborted. This will cover, 32 but is not limited to, the following cases: 33 * A devserver failure that manifests as a timeout 34 * No DUTs available midway through a suite 35 * Provision/Reset/Cleanup took longer time than expected for new image 36 * A regression in scheduler tick time. 37 5 - BOARD_NOT_AVAILABLE: If there is no host for the requested board/pool. 38 6 - INVALID_OPTIONS: If options are not valid. 
39 """ 40 41 import argparse 42 import ast 43 import collections 44 from collections import namedtuple 45 from datetime import datetime 46 from datetime import timedelta 47 import functools 48 import getpass 49 import json 50 import logging 51 import os 52 import re 53 import sys 54 import time 55 import warnings 56 57 import common 58 from chromite.lib import buildbot_annotations as annotations 59 60 from autotest_lib.client.common_lib import control_data 61 from autotest_lib.client.common_lib import error 62 from autotest_lib.client.common_lib import global_config, enum 63 from autotest_lib.client.common_lib import priorities 64 from autotest_lib.client.common_lib import time_utils 65 from autotest_lib.client.common_lib.cros import retry 66 from autotest_lib.frontend.afe import rpc_client_lib 67 from autotest_lib.frontend.afe.json_rpc import proxy 68 from autotest_lib.server import site_utils 69 from autotest_lib.server import utils 70 from autotest_lib.server.cros import provision 71 from autotest_lib.server.cros.dynamic_suite import constants 72 from autotest_lib.server.cros.dynamic_suite import frontend_wrappers 73 from autotest_lib.server.cros.dynamic_suite import reporting_utils 74 from autotest_lib.server.cros.dynamic_suite import tools 75 from autotest_lib.site_utils import diagnosis_utils 76 from autotest_lib.site_utils import job_overhead 77 78 CONFIG = global_config.global_config 79 80 _DEFAULT_AUTOTEST_INSTANCE = CONFIG.get_config_value( 81 'SERVER', 'hostname', type=str) 82 _URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str) 83 84 # Return code that will be sent back to autotest_rpc_server.py 85 RETURN_CODES = enum.Enum( 86 'OK', 'ERROR', 'WARNING', 'INFRA_FAILURE', 'SUITE_TIMEOUT', 87 'BOARD_NOT_AVAILABLE', 'INVALID_OPTIONS') 88 89 # Minimum RPC timeout setting for calls expected to take long time, e.g., 90 # create_suite_job. 
If default socket time (socket.getdefaulttimeout()) is 91 # None or greater than this value, the default will be used. 92 # The value here is set to be the same as the timeout for the RetryingAFE object 93 # so long running RPCs can wait long enough before being aborted. 94 _MIN_RPC_TIMEOUT = 600 95 96 # Number of days back to search for existing job. 97 _SEARCH_JOB_MAX_DAYS = 14 98 99 _PROVISION_SUITE = 'provision' 100 101 102 @functools.total_ordering 103 class _ReturnResult(object): 104 """Represents overall result of run_suite operation. 105 106 _ReturnResult instances sort based on priority (the order in 107 _RETURN_RESULTS). 108 109 Furthermore, _ReturnResult instances can be combined by bitwise or 110 ("union"), which returns the instance with the higher priority 111 between the two (the instance with higher priority is a "superset" 112 of the other). 113 114 Do not create new instances of this; use _RETURN_RESULTS instead. 115 """ 116 117 def __init__(self, return_code, message): 118 self.return_code = return_code 119 self.message = message 120 121 def __repr__(self): 122 return '<{cls} {key}, {this.return_code}, {this.message}>'.format( 123 cls=type(self).__name__, 124 key=self._getkey(), 125 this=self) 126 127 def __gt__(self, other): 128 if isinstance(other, type(self)): 129 return self._getkey() > other._getkey() 130 else: 131 return NotImplemented 132 133 def __eq__(self, other): 134 if isinstance(other, type(self)): 135 return (self.return_code == other.return_code 136 and self.message == other.message) 137 else: 138 return NotImplemented 139 140 def __hash__(self): 141 return hash(self.return_code) ^ hash(self.message) 142 143 def __or__(self, other): 144 if isinstance(other, type(self)): 145 if self > other: 146 return self 147 else: 148 return other 149 else: 150 return NotImplemented 151 152 def _getkey(self): 153 """Return sort key.""" 154 return _RETURN_RESULTS_LIST.index(self) 155 156 def suite_result(self, output_dict=None): 157 """Make a 
SuiteResult using this _ReturnResult. 158 159 @param output_dict: output_dict to merge into SuiteResult. 160 """ 161 if output_dict is None: 162 output_dict = dict() 163 else: 164 output_dict = output_dict.copy() 165 if self.message: 166 output_dict['return_message'] = self.message 167 return SuiteResult(self.return_code, output_dict) 168 169 170 _RETURN_RESULTS = collections.OrderedDict([ 171 ('ok', _ReturnResult(RETURN_CODES.OK, '')), 172 173 ('test_warning', _ReturnResult( 174 RETURN_CODES.WARNING, 'Test job raised warning.')), 175 ('suite_warning', _ReturnResult( 176 RETURN_CODES.WARNING, 'Suite job raised warning.')), 177 ('test_retry', _ReturnResult( 178 RETURN_CODES.WARNING, 'Tests were retried.')), 179 180 ('test_aborted_prestart', _ReturnResult( 181 RETURN_CODES.SUITE_TIMEOUT, 182 'Tests were aborted before running; suite must have timed out.')), 183 # This really indicates a user action or an infra failure. But, suite 184 # timeouts cause similar fauilres in the individual tests, so we must 185 # classify these lower than suite_timeout. In case of a suite_timeout, the 186 # result from the suite job will promote the result to suite_timeout. 
187 ('test_aborted_mystery', 188 _ReturnResult( 189 RETURN_CODES.SUITE_TIMEOUT, 190 'Tests were aborted after running, but before timeout; ' 191 'Test was manually aborted or parsing results failed: ' 192 'crbug.com/796348.')), 193 ('suite_timeout', _ReturnResult( 194 RETURN_CODES.SUITE_TIMEOUT, 'Suite job timed out.')), 195 196 ('test_views_missing', _ReturnResult( 197 RETURN_CODES.INFRA_FAILURE, 'No test views found.')), 198 ('suite_failed', _ReturnResult( 199 RETURN_CODES.INFRA_FAILURE, 'Suite job failed.')), 200 ('provision_failed', _ReturnResult( 201 RETURN_CODES.INFRA_FAILURE, 'Provisioning failed.')), 202 203 ('test_failure', _ReturnResult( 204 RETURN_CODES.ERROR, 'Tests failed.')), 205 ]) 206 _RETURN_RESULTS_LIST = list(_RETURN_RESULTS.values()) 207 208 209 def bool_str(x): 210 """Boolean string type for option arguments. 211 212 @param x: string representation of boolean value. 213 214 """ 215 if x == 'True': 216 return True 217 elif x == 'False': 218 return False 219 else: 220 raise argparse.ArgumentTypeError( 221 '%s is not one of True or False' % (x,)) 222 223 224 def _get_priority_value(x): 225 """Convert a priority representation to its int value. 226 227 Priorities can be described either by an int value (possibly as a string) 228 or a name string. This function coerces both forms to an int value. 229 230 This function is intended for casting command line arguments during 231 parsing. 232 233 @param x: priority value as an int, int string, or name string 234 235 @returns: int value of priority 236 """ 237 try: 238 return int(x) 239 except ValueError: 240 try: 241 return priorities.Priority.get_value(x) 242 except AttributeError: 243 raise argparse.ArgumentTypeError( 244 'Unknown priority level %s. Try one of %s.' 
def make_parser():
    """Make ArgumentParser instance for run_suite.py.

    Several options that are conceptually boolean flags (--no_wait,
    --file_bugs, --retry, --offload_failures_only) take an explicit
    "True"/"False" argument parsed by bool_str, because the autotest
    "proxy" code can't handle flags that don't take arguments.

    @returns: The configured argparse.ArgumentParser.
    """
    parser = argparse.ArgumentParser(
            usage="%(prog)s [options]")
    parser.add_argument("-b", "--board", dest="board")
    parser.add_argument(
        "--model",
        help="The device model to run tests against. For non-unified "
             "builds, model and board are synonymous, but board is more "
             "accurate in some cases. Only pass this option if your build "
             "is a unified build.",
    )
    parser.add_argument("-i", "--build", dest="build")
    parser.add_argument(
        "-w", "--web", dest="web", default=None,
        help="Address of a webserver to receive suite requests.")
    parser.add_argument(
        '--cheets_build', dest='cheets_build', default=None,
        help='ChromeOS Android build to be installed on dut.')
    parser.add_argument(
        '--firmware_rw_build', dest='firmware_rw_build', default=None,
        help='Firmware build to be installed in dut RW firmware.')
    parser.add_argument(
        '--firmware_ro_build', dest='firmware_ro_build', default=None,
        help='Firmware build to be installed in dut RO firmware.')
    parser.add_argument(
        '--test_source_build', dest='test_source_build', default=None,
        help=('Build that contains the test code, '
              'e.g., it can be the value of `--build`, '
              '`--firmware_rw_build` or `--firmware_ro_build` '
              'arguments. Default is None, that is, use the test '
              'code from `--build` (CrOS image)'))
    # This should just be a boolean flag, but the autotest "proxy" code
    # can't handle flags that don't take arguments.
    parser.add_argument(
        "-n", "--no_wait", dest="no_wait", default=False, type=bool_str,
        help='Must pass "True" or "False" if used.')
    # If you really want no pool, --pool="" will do it. USE WITH CARE.
    parser.add_argument("-p", "--pool", dest="pool", default="suites")
    parser.add_argument("-s", "--suite_name", dest="name")
    parser.add_argument("-a", "--afe_timeout_mins", type=int,
                        dest="afe_timeout_mins", default=30)
    parser.add_argument("-t", "--timeout_mins", type=int,
                        dest="timeout_mins", default=1440)
    parser.add_argument("-x", "--max_runtime_mins", type=int,
                        dest="max_runtime_mins", default=1440)
    parser.add_argument("-d", "--delay_sec", type=int,
                        dest="delay_sec", default=10)
    parser.add_argument("-m", "--mock_job_id", dest="mock_job_id",
                        help="Attach to existing job id for already running "
                             "suite, and creates report.")
    # NOTE(akeshet): This looks similar to --no_wait, but behaves differently.
    # --no_wait is passed in to the suite rpc itself and affects the suite,
    # while this does not.
    parser.add_argument("-c", "--create_and_return", dest="create_and_return",
                        action="store_true",
                        help="Create the suite and print the job id, then "
                             "finish immediately.")
    parser.add_argument("-u", "--num", dest="num", type=int, default=None,
                        help="Deprecated, does nothing.")
    # Same boolean flag issue applies here.
    parser.add_argument(
        "-f", "--file_bugs", dest="file_bugs", default=False, type=bool_str,
        help=('File bugs on test failures. Must pass "True" or '
              '"False" if used.'))
    parser.add_argument("-l", "--bypass_labstatus", dest="bypass_labstatus",
                        action="store_true", help='Bypass lab status check.')
    # We allow either a number or a string for the priority.  This way, if you
    # know what you're doing, one can specify a custom priority level between
    # other levels.
    parser.add_argument("-r", "--priority", dest="priority",
                        type=_get_priority_value,
                        default=priorities.Priority.DEFAULT,
                        action="store",
                        help="Priority of suite. Either numerical value, or "
                             "one of (" + ", ".join(priorities.Priority.names)
                             + ").")
    parser.add_argument(
        '--retry', dest='retry', default=False, type=bool_str, action='store',
        help='Enable test retry.  Must pass "True" or "False" if used.')
    # The two help fragments are concatenated, so the first one needs its
    # trailing space to avoid rendering "retriesallowed".
    parser.add_argument('--max_retries', dest='max_retries', default=None,
                        type=int, action='store', help='Maximum retries '
                        'allowed at suite level. No limit if not specified.')
    parser.add_argument('--minimum_duts', dest='minimum_duts', type=int,
                        default=0, action='store',
                        help='Check that the pool has at least such many '
                             'healthy machines, otherwise suite will not run. '
                             'Default to 0.')
    parser.add_argument('--suite_min_duts', dest='suite_min_duts', type=int,
                        default=0, action='store',
                        help='Preferred minimum number of machines. Scheduler '
                             'will prioritize on getting such many machines for '
                             'the suite when it is competing with another suite '
                             'that has a higher priority but already got minimum '
                             'machines it needs. Default to 0.')
    parser.add_argument("--suite_args", dest="suite_args",
                        type=ast.literal_eval,
                        default=None, action="store",
                        help="A dict of args passed to the suite control file.")
    parser.add_argument('--offload_failures_only',
                        dest='offload_failures_only', type=bool_str,
                        action='store', default=False,
                        help='Only enable gs_offloading for failed tests. '
                             'Successful tests will be deleted. Must pass "True"'
                             ' or "False" if used.')
    parser.add_argument('--use_suite_attr', dest='use_suite_attr',
                        action='store_true', default=False,
                        help='Advanced. Run the suite based on ATTRIBUTES of '
                             'control files, rather than SUITE.')
    parser.add_argument('--json_dump', dest='json_dump', action='store_true',
                        default=False,
                        help='Dump the output of run_suite to stdout.')
    parser.add_argument(
        '--run_prod_code', dest='run_prod_code',
        action='store_true', default=False,
        help='Run the test code that lives in prod aka the test '
             'code currently on the lab servers.')
    parser.add_argument(
        '--delay_minutes', type=int, default=0,
        help=('Delay the creation of test jobs for a given '
              'number of minutes. This argument can be used to '
              'force provision jobs being delayed, which helps '
              'to distribute loads across devservers.'))
    parser.add_argument(
        '--skip_duts_check', dest='skip_duts_check', action='store_true',
        default=False, help='If True, skip minimum available DUTs check')
    parser.add_argument(
        '--job_keyvals', dest='job_keyvals', type=ast.literal_eval,
        action='store', default=None,
        help='A dict of job keyvals to be inject to suite control file')
    parser.add_argument(
        '--test_args', dest='test_args', type=ast.literal_eval,
        action='store', default=None,
        help=('A dict of args passed all the way to each individual test that '
              'will be actually ran.'))
    parser.add_argument(
        '--require_logfile', action='store_true',
        help=('Stream logs of run_suite.py to a local file named '
              'run_suite-<build name>.log.'))

    # Used for monitoring purposes, to measure no-op swarming proxy latency.
    parser.add_argument('--do_nothing', action='store_true',
                        help=argparse.SUPPRESS)

    # Used when lab/job status checking is needed. Currently its only user is
    # suite scheduler v2.
    parser.add_argument(
        '--pre_check', action='store_true',
        help=('Check lab and job status before kicking off a suite. Used by '
              'suite scheduler v2.'))

    # TODO(crbug.com/763207): This is to support calling old moblab RPC
    # with ToT code.  This does not need to be supported after M62.
    parser.add_argument('--oldrpc', action='store_true',
                        help='Use old AFE RPC.')

    return parser
Used by ' 397 'suite scheduler v2.')) 398 399 # TODO(crbug.com/763207): This is to support calling old moblab RPC 400 # with ToT code. This does not need to be supported after M62. 401 parser.add_argument('--oldrpc', action='store_true', 402 help='Use old AFE RPC.') 403 404 return parser 405 406 407 def verify_and_clean_options(options): 408 """Verify the validity of options. 409 410 @param options: The parsed options to verify. 411 412 @returns: True if verification passes, False otherwise. 413 414 """ 415 if options.mock_job_id and ( 416 not options.build or not options.name or not options.board): 417 print ('When using -m, need to specify build, board and suite ' 418 'name which you have used for creating the original job') 419 return False 420 else: 421 if not options.build: 422 print 'Need to specify which build to use' 423 return False 424 if not options.board: 425 print 'Need to specify board' 426 return False 427 if not options.name: 428 print 'Need to specify suite name' 429 return False 430 if options.num is not None: 431 warnings.warn('-u/--num option is deprecated; it does nothing.') 432 del options.num 433 if not options.retry and options.max_retries is not None: 434 print 'max_retries can only be used with --retry=True' 435 return False 436 if options.use_suite_attr and options.suite_args is not None: 437 print ('The new suite control file cannot parse the suite_args: %s.' 438 'Please not specify any suite_args here.' % options.suite_args) 439 return False 440 if options.no_wait and options.retry: 441 print 'Test retry is not available when using --no_wait=True' 442 # Default to use the test code in CrOS build. 443 if not options.test_source_build and options.build: 444 options.test_source_build = options.build 445 return True 446 447 448 def change_options_for_suite_attr(options): 449 """Change options to be prepared to run the suite_attr_wrapper. 
class TestResult(object):

    """Represents the result of a TestView."""

    # Statuses with a dedicated label; every other status is shown as failed.
    _PRETTY_STATUS_MAP = {
        'GOOD': '[ PASSED ]',
        'TEST_NA': '[ INFO ]',
    }

    def __init__(self, test_view, retry_count=0):
        """Initialize instance.

        @param test_view: TestView instance.
        @param retry_count: Retry count for test.  Optional.
        """
        self.name = test_view.get_testname()
        self.status = test_view['status']
        self.reason = test_view['reason']
        self.retry_count = retry_count

    @property
    def _pretty_status(self):
        """Pretty status string."""
        labels = self._PRETTY_STATUS_MAP
        return labels.get(self.status, '[ FAILED ]')

    def log_using(self, log_function, name_column_width):
        """Log the test result using the given log function.

        One line with name and pretty status is always logged; a detail
        line follows when the status is not GOOD, and another when the
        test was retried.

        @param log_function: Log function to use.  Example: logging.info
        @param name_column_width: Width of name column for formatting.
        """
        label = self.name.ljust(name_column_width)
        log_function('%s%s', label, self._pretty_status)
        passed = self.status == 'GOOD'
        if not passed:
            log_function('%s %s: %s', label, self.status, self.reason)
        was_retried = self.retry_count > 0
        if was_retried:
            log_function('%s retry_count: %s', label, self.retry_count)
def get_original_suite_name(suite_name, suite_args):
    """Get the original suite name when running suite_attr_wrapper.

    @param suite_name: the name of the suite launched in afe. When it is
                       suite_attr_wrapper, the suite that actually running is
                       specified in the suite_args.
    @param suite_args: dict of suite args from argument parsing. May be
                       None or lack 'attr_filter', in which case suite_name
                       is returned unchanged.

    @returns: the original suite name: the first 'suite:<name>' term found
              in suite_args['attr_filter'], or suite_name if there is none.

    """
    if suite_name != 'suite_attr_wrapper':
        return suite_name
    # Tolerate a missing dict and a missing/None filter instead of crashing.
    attrs = (suite_args or {}).get('attr_filter') or ''
    prefix = 'suite:'
    # The filter is a boolean expression; split on parentheses and spaces
    # and take the first term that names a suite.
    for term in re.split(r'[() ]', attrs):
        if term.startswith(prefix):
            return term[len(prefix):]
    return suite_name
class LogLink(object):
    """Information needed to record a link in the logs.

    Depending on context and the information provided at
    construction time, the link may point to either to log files for
    a job, or to a bug filed for a failure in the job.

    @var anchor  The link text.
    @var url     The link url.
    @var bug_id  Id of a bug to link to, or None.
    """

    # A list of tests that don't get retried so skip the dashboard.
    _SKIP_RETRY_DASHBOARD = ['provision']

    _BUG_LINK_PREFIX = 'Auto-Bug'
    _LOG_LINK_PREFIX = 'Test-Logs'

    def __init__(self, anchor, server, job_string, bug_info=None, reason=None,
                 retry_count=0, testname=None, sponge_url=None):
        """Initialize the LogLink by generating the log URL.

        @param anchor      The link text.
        @param server      The hostname of the server this suite ran on.
        @param job_string  The job whose logs we'd like to link to.
        @param bug_info    Info about the bug, if one was filed; unpacked
                           as a (bug_id, bug_count) pair.
        @param reason      A string representing the reason of failure if any.
        @param retry_count How many times the test has been retried.
        @param testname    Optional Arg that supplies the testname.
        @param sponge_url  url to Sponge result.
        """
        self.anchor = anchor
        server_url = rpc_client_lib.add_protocol(server)
        self.url = _URL_PATTERN % (server_url, job_string)
        self.reason = reason
        self.retry_count = retry_count
        self.testname = testname
        self.sponge_url = sponge_url
        # Without bug info both bug attributes are None.
        self.bug_id, self.bug_count = bug_info if bug_info else (None, None)

    @property
    def bug_url(self):
        """URL of associated bug."""
        if not self.bug_id:
            return None
        return reporting_utils.link_crbug(self.bug_id)

    @property
    def _bug_count_text(self):
        """Return bug count as human friendly text."""
        count = self.bug_count
        if count is None:
            return 'unknown number of reports'
        if count == 1:
            return 'new report'
        return '%s reports' % count

    def GenerateBuildbotLinks(self):
        """Generate a link formatted to meet buildbot expectations.

        If there is a bug associated with this link, report a link to the bug
        and a link to the job logs; otherwise report a link to the job logs.

        @return A generator of links formatted for the buildbot log annotator.
        """
        if self.bug_url:
            yield self._get_link_to_bug()
        yield self._get_link_to_job_logs()

    def _get_link_to_bug(self):
        """Return buildbot link to bug.

        @return A link formatted for the buildbot log annotator.
        """
        details = self._get_info_strings()
        details.append(self._bug_count_text)
        text = self._format_anchor_text(self._BUG_LINK_PREFIX, details)
        return annotations.StepLink(text, self.bug_url)

    def _get_link_to_job_logs(self):
        """Return buildbot link to job logs.

        @return A link formatted for the buildbot log annotator.
        """
        details = self._get_info_strings()
        text = self._format_anchor_text(self._LOG_LINK_PREFIX, details)
        return annotations.StepLink(text, self.url)

    def _get_info_strings(self):
        """Return a list of info strings for _format_anchor_text()."""
        details = []
        if self.retry_count > 0:
            details += ['retry_count: %d' % self.retry_count]
        if self.reason:
            details += [self.reason]
        return details

    def _format_anchor_text(self, prefix, info_strings):
        """Format anchor text given a prefix and info strings.

        @param prefix        The prefix of the anchor text.
        @param info_strings  Iterable of strings.
        @return A anchor_text with the right prefix and info strings.
        """
        joined_info = ', '.join(info_strings)
        stripped_anchor = self.anchor.strip()
        return '[{prefix}]: {anchor}: {info}'.format(
                prefix=prefix, anchor=stripped_anchor, info=joined_info)

    @property
    def text_link(self):
        """Link to the job's logs, for consumption by a human.

        @return A link formatted for human readability.
        """
        return '%s %s' % (self.anchor, self.url)

    def GenerateRetryLink(self):
        """Generate a link to the retry dashboard.

        @return A link formatted for the buildbot log annotator, or None
                when there is no testname or the test is in
                _SKIP_RETRY_DASHBOARD.
        """
        name = self.testname
        if name and name not in self._SKIP_RETRY_DASHBOARD:
            return annotations.StepLink(
                    text='[Flake-Dashboard]: %s' % name,
                    url=reporting_utils.link_retry_url(name))
        return None

    def GenerateHistoryLink(self):
        """Generate a link to the test history dashboard.

        @return A link formatted for the buildbot log annotator, or None
                when there is no testname or the test is in
                _SKIP_RETRY_DASHBOARD.
        """
        name = self.testname
        if name and name not in self._SKIP_RETRY_DASHBOARD:
            return annotations.StepLink(
                    text='[Test-History]: %s' % name,
                    url=reporting_utils.link_test_history(name))
        return None
class Timings(object):
    """Timings for important events during a suite.

    All timestamps are datetime.datetime objects.

    @var suite_job_id: the afe job id of the suite job for which
                       we are recording the timing for.
    @var download_start_time: the time the devserver starts staging
                              the build artifacts. Recorded in create_suite_job.
    @var payload_end_time: the time when the artifacts only necessary to start
                           installsing images onto DUT's are staged.
                           Recorded in create_suite_job.
    @var artifact_end_time: the remaining artifacts are downloaded after we kick
                            off the reimaging job, at which point we record
                            artifact_end_time. Recorded in dynamic_suite.py.
    @var suite_start_time: the time the suite started.
    @var tests_start_time: the time the first test started running.
    @var tests_end_time: the time the last test finished running.
    """

    def __init__(self, suite_job_id):
        self.suite_job_id = suite_job_id

        # Timings related to staging artifacts on devserver.
        self.download_start_time = None
        self.payload_end_time = None
        self.artifact_end_time = None

        # The test_start_time, but taken off the view that corresponds to the
        # suite instead of an individual test.
        self.suite_start_time = None

        # Earliest and Latest tests in the set of TestViews passed to us.
        self.tests_start_time = None
        self.tests_end_time = None

    def RecordTiming(self, view):
        """Given a test report view, extract and record pertinent time info.

        get_detailed_test_views() returns a list of entries that provide
        info about the various parts of a suite run.  This method can take
        any one of these entries and look up timestamp info we might want
        and record it.

        If timestamps are unavailable, datetime.datetime.min/max will be used.

        @param view: A TestView object.
        """
        if view['test_started_time']:
            start_candidate = time_utils.time_string_to_datetime(
                    view['test_started_time'])
        else:
            start_candidate = datetime.min
        if view['test_finished_time']:
            end_candidate = time_utils.time_string_to_datetime(
                    view['test_finished_time'])
        else:
            end_candidate = datetime.max

        is_suite_job = view.get_testname() == TestView.SUITE_JOB
        if is_suite_job:
            self.suite_start_time = start_candidate
        else:
            self._UpdateFirstTestStartTime(start_candidate)
            self._UpdateLastTestEndTime(end_candidate)

        # Artifact staging times live in the suite job's own keyvals.
        if view['afe_job_id'] == self.suite_job_id and 'job_keyvals' in view:
            keyvals = view['job_keyvals']

            def keyval_time(key):
                """Convert the keyval stored under key to a datetime."""
                return time_utils.time_string_to_datetime(
                        keyvals.get(key), handle_type_error=True)

            self.download_start_time = keyval_time(
                    constants.DOWNLOAD_STARTED_TIME)
            self.payload_end_time = keyval_time(
                    constants.PAYLOAD_FINISHED_TIME)
            self.artifact_end_time = keyval_time(
                    constants.ARTIFACT_FINISHED_TIME)

    def _UpdateFirstTestStartTime(self, candidate):
        """Update self.tests_start_time, iff candidate is an earlier time.

        @param candidate: a datetime.datetime object.
        """
        current = self.tests_start_time
        if current and not candidate < current:
            return
        self.tests_start_time = candidate

    def _UpdateLastTestEndTime(self, candidate):
        """Update self.tests_end_time, iff candidate is a later time.

        @param candidate: a datetime.datetime object.
        """
        current = self.tests_end_time
        if current and not candidate > current:
            return
        self.tests_end_time = candidate

    def __str__(self):
        template = ('\n'
                    'Suite timings:\n'
                    'Downloads started at %s\n'
                    'Payload downloads ended at %s\n'
                    'Suite started at %s\n'
                    'Artifact downloads ended (at latest) at %s\n'
                    'Testing started at %s\n'
                    'Testing ended at %s\n')
        values = (self.download_start_time,
                  self.payload_end_time,
                  self.suite_start_time,
                  self.artifact_end_time,
                  self.tests_start_time,
                  self.tests_end_time)
        return template % values
def instance_for_pool(pool_name):
    """
    Return the hostname of the server that should be used to service a suite
    for the specified pool.

    The pool is looked up in the POOL_INSTANCE_SHARDING config section;
    _DEFAULT_AUTOTEST_INSTANCE is passed as the default for that lookup.

    @param pool_name: The pool (without 'pool:') to schedule the suite against.
    @return: The correct host that should be used to service this suite run.
    """
    sharding_section = 'POOL_INSTANCE_SHARDING'
    return CONFIG.get_config_value(
            sharding_section, pool_name,
            default=_DEFAULT_AUTOTEST_INSTANCE)
def get_testname(self):
    """Return the normalized test name that should be shown in the output.

    The name is computed on the first call and cached in self.testname.

    For most tests, e.g. one started from a control file with
        job.runtest('my_Test', tag='tag')
    view['test_name'] looks like 'my_Test.tag'; it is handed to
    tools.get_test_name() to remove a 'build/suite' prefix (if any) and is
    otherwise returned unchanged.

    Special cases:
    1) The view is the suite job's own SERVER_JOB.
       'Suite job' is returned.

    2) The view is a SERVER_JOB or CLIENT_JOB of a child job, or of a solo
       test run that is not part of a suite.
       The job name is joined in front of the test name with '_' before
       the build/suite prefix is removed, giving names such as
           'dummy_Pass_SERVER_JOB'
           'dummy_Fail_SERVER_JOB'

    3) The view belongs to a suite job and its status is ABORT.
       view['test_name'] is then the child job's name, e.g.
           'lumpy-release/R35-5712.0.0/dummy/dummy_Pass'
       which is reduced to
           'dummy_Pass'

    4) The view belongs to a suite job and its status is TEST_NA.
       view['test_name'] is then the NAME field of the control file, e.g.
       'dummy_Pass', and is not modified.

    @returns: Test name after normalization.

    """
    cached_name = self.testname
    if cached_name is not None:
        return cached_name

    raw_name = self.view['test_name']
    is_server_job = raw_name.startswith('SERVER_JOB')

    if self.is_suite_view and is_server_job:
        # The suite job's own SERVER_JOB is reported as 'Suite job'.
        self.testname = self.SUITE_JOB
        return self.testname

    if is_server_job or raw_name.startswith('CLIENT_JOB'):
        # SERVER_JOB / CLIENT_JOB views get the job name as a prefix.
        qualified_name = '%s_%s' % (self.view['job_name'], raw_name)
    else:
        qualified_name = raw_name

    # Remove the build and suite name from the test name, if any.
    self.testname = tools.get_test_name(
            self.build, self.suite_name, qualified_name)
    return self.testname
def hit_timeout(self):
    """Tell whether the corresponding job exceeded its own max runtime.

    Note this method should not be called for test views that belong to
    a suite job and are judged irrelevant by is_relevant_suite_view:
    they are tied to the suite job, whose start/finish times make no
    sense for an irrelevant test view.

    @returns: True if the job's finished time minus its started time,
              in minutes, is greater than afe_job.max_runtime_mins.
              False otherwise, including when either time is missing.
    """
    never_ran = (self.is_relevant_suite_view() and
                 self.get_testname() != self.SUITE_JOB)
    if never_ran:
        # Any relevant suite test view except SUITE_JOB was never run,
        # so it cannot have hit its own timeout.
        return False

    def _to_datetime(stamp):
        """Parse a tko time string; an empty/None stamp yields None."""
        if not stamp:
            return None
        return datetime.strptime(stamp, time_utils.TIME_FMT)

    started_at = _to_datetime(self.view['job_started_time'])
    finished_at = _to_datetime(self.view['job_finished_time'])
    if started_at is None or finished_at is None:
        return False

    elapsed_mins = (finished_at - started_at).total_seconds()/60.0
    return elapsed_mins > self.afe_job.max_runtime_mins
def is_in_fail_status(self):
    """Check if the view's status corresponds to a failure.

    @returns: True if the status is FAIL, ERROR or ABORT. False otherwise.

    """
    # All the statuses tests can have when they fail.
    failing_statuses = ['FAIL', 'ERROR', 'ABORT']
    current_status = self.view['status']
    return current_status in failing_statuses
def should_display_buildbot_link(self):
    """Check whether a buildbot link should show for this view.

    For the suite job view, show a buildbot link if it fails, i.e. its
    status is neither GOOD nor TEST_NA.
    For a normal test view,
        show a buildbot link if it is a retry;
        show a buildbot link if it hits its own timeout;
        show a buildbot link if it fails. This doesn't include the case
        where it was aborted but has not hit its own timeout (most
        likely it was aborted because the suite has timed out).

    @returns: True if we should show the buildbot link.
              False otherwise.
    """
    status = self.view['status']
    is_bad_status = status != 'GOOD' and status != 'TEST_NA'

    if self.get_testname() == self.SUITE_JOB:
        return is_bad_status
    if self.is_retry():
        return True
    if is_bad_status:
        # An aborted test only earns a link when it hit its own timeout.
        return not self.is_aborted() or self.hit_timeout()
    # A passing (or TEST_NA), non-retried test: answer with an explicit
    # False rather than falling off the end and returning None.
    return False
def log_buildbot_links(log_func, links):
    """Write the buildbot annotations of every link to a logging function.

    For each link, all of its generated buildbot links are logged first,
    followed by its retry link and then its history link; the latter two
    are only logged when the link produces a non-empty value for them.

    @param log_func: Logging function to use.
    @param links: Iterable of LogLink instances.
    """
    optional_generators = ('GenerateRetryLink', 'GenerateHistoryLink')
    for log_link in links:
        for annotation in log_link.GenerateBuildbotLinks():
            log_func(annotation)
        for generator_name in optional_generators:
            optional_annotation = getattr(log_link, generator_name)()
            if optional_annotation:
                log_func(optional_annotation)
class _ProvisionReturnCodeComputer(_ReturnCodeComputer):
    """This is used for returning the _ReturnResult for provision suites.

    It behaves like _ReturnCodeComputer, except that it counts the test
    views whose individual result is 'ok' or 'test_retry'. When that count
    reaches the required number of passing provision jobs, the overall
    result is replaced by 'ok'.
    """

    def __init__(self, num_required):
        """Initialize instance.

        @param num_required: The number of passing provision jobs needed.
        """
        super(_ProvisionReturnCodeComputer, self).__init__()
        self._num_required = num_required
        self._num_successful = 0

    def __call__(self, test_views):
        """Compute the result, upgrading it if enough provisions passed.

        @param test_views: Iterable of TestView objects to evaluate.

        @returns: The 'ok' _ReturnResult if at least num_required views
                  were successful, otherwise the result computed by the
                  parent class.
        """
        # Start each computation from a clean tally. Without the reset, a
        # second call on the same instance would add to the count of the
        # previous call and could upgrade a failing result by mistake.
        self._num_successful = 0
        result = super(_ProvisionReturnCodeComputer, self).__call__(test_views)
        if self._num_successful >= self._num_required:
            logging.info('Return result upgraded from %r'
                         ' due to enough ok provisions',
                         result)
            return _RETURN_RESULTS['ok']
        return result

    def _get_test_result(self, test_view):
        """Return the result for one test view, counting the successes.

        @param test_view: The TestView of a test job.

        @returns: The _ReturnResult computed by the parent class.
        """
        result = (super(_ProvisionReturnCodeComputer, self)
                  ._get_test_result(test_view))
        if result in {_RETURN_RESULTS[s] for s in ('ok', 'test_retry')}:
            self._num_successful += 1
        return result
1259 1260 @var _instance_server: The hostname of the server that is used 1261 to service the suite. 1262 @var _afe: The afe rpc client. 1263 @var _tko: The tko rpc client. 1264 @var _build: The build for which the suite is run, 1265 e.g. 'lumpy-release/R35-5712.0.0' 1266 @var _board: The target board for which the suite is run, 1267 e.g., 'lumpy', 'link'. 1268 @var _suite_name: The suite name, e.g. 'bvt', 'dummy'. 1269 @var _suite_job_id: The job id of the suite for which we are going to 1270 collect results. 1271 @var _original_suite_name: The suite name we record timing would be 1272 different from _suite_name when running 1273 suite_attr_wrapper. 1274 @var _return_code_function: Called to return what the overall result of 1275 the suite is. 1276 @var _suite_views: A list of TestView objects, representing relevant 1277 test views of the suite job. 1278 @var _child_views: A list of TestView objects, representing test views 1279 of the child jobs. 1280 @var _test_views: A list of TestView objects, representing all test views 1281 from _suite_views and _child_views. 1282 @var _web_links: A list of web links pointing to the results of jobs. 1283 @var buildbot_links: A list of buildbot links for non-passing tests. 1284 @var _solo_test_run: True if this is a single test run. 1285 @var return_result: The _ReturnResult of the suite run. 1286 @var is_aborted: Whether the suite was aborted or not. 1287 True, False or None (aborting status is unknown yet) 1288 @var timings: A Timing object that records the suite's timings. 
def __init__(self, instance_server, afe, tko, build, board,
             suite_name, suite_job_id,
             return_code_function,
             original_suite_name=None,
             user=None, solo_test_run=False):
    """Store the collaborators and reset all collected state.

    @param instance_server: The hostname of the server that is used
                            to service the suite.
    @param afe: The afe rpc client.
    @param tko: The tko rpc client.
    @param build: The build for which the suite is run.
    @param board: The target board for which the suite is run.
    @param suite_name: The suite name.
    @param suite_job_id: The job id of the suite to collect results for.
    @param return_code_function: Called with the test views to compute
                                 the overall result of the suite.
    @param original_suite_name: The suite name used when recording
                                timings; suite_name is used if not given.
    @param user: The user for which the tests are run; the value of
                 getpass.getuser() is used if not given.
    @param solo_test_run: True if this is a single test run.
    """
    # Where and how to talk to the lab.
    self._instance_server = instance_server
    self._afe = afe
    self._tko = tko
    self._user = user if user else getpass.getuser()

    # What is being collected.
    self._build = build
    self._board = board
    self._suite_name = suite_name
    self._original_suite_name = (original_suite_name if original_suite_name
                                 else suite_name)
    self._suite_job_id = suite_job_id
    self._solo_test_run = solo_test_run
    self._return_code_function = return_code_function

    # State filled in while collecting.
    self._suite_views = []
    self._child_views = []
    self._test_views = []
    self._retry_counts = {}
    self._missing_results = {}
    self._num_child_jobs = 0
    self._web_links = []
    self.buildbot_links = []

    # Outcome of the collection, unknown until it has been computed.
    self.return_result = None
    self.is_aborted = None
    self.timings = None
def _compute_retry_count(self, view):
    """Count how many times the test has been retried.

    Starting from the view's 'retry_original_job_id' job keyval, the
    chain of original jobs is followed through tko, one
    'get_detailed_test_views' call per job, until a job has no such
    keyval or has no test views at all.

    @param view: A TestView instance.
    @returns: An int value indicating the retry count.

    """
    retries = 0
    previous_job_id = view['job_keyvals'].get('retry_original_job_id')
    while previous_job_id:
        retries += 1
        previous_views = self._tko.run(
                call='get_detailed_test_views', afe_job_id=previous_job_id)
        if not previous_views:
            # Nothing recorded for that job, the chain ends here.
            break
        previous_job_id = previous_views[0]['job_keyvals'].get(
                'retry_original_job_id')
    return retries
def _generate_web_and_buildbot_links(self):
    """Rebuild the web links and buildbot links from the test views.

    One LogLink is created for every view in self._test_views and stored
    in self._web_links. Links whose view asks for a buildbot link also
    get a reason attached and are stored in self.buildbot_links.
    """
    # TODO(fdeng): If a job was aborted before it reaches Running
    # state, we read the test view from the suite job
    # and thus this method generates a link pointing to the
    # suite job's page for the aborted job. Need a fix.
    self._web_links = []
    self.buildbot_links = []

    # Bug info are stored in the suite job's keyvals. There are none to
    # use for a solo test run or when no suite view was collected.
    if self._solo_test_run or not self._suite_views:
        suite_job_keyvals = {}
    else:
        suite_job_keyvals = self._suite_views[0]['job_keyvals']

    for view in self._test_views:
        retries = self._retry_counts.get(view['test_idx'], 0)
        bug_info = view.get_bug_info(suite_job_keyvals)
        owner_tag = view.get_job_id_owner_str()
        shown_name = view.get_testname()
        log_link = LogLink(
                anchor=shown_name,
                server=self._instance_server,
                job_string=owner_tag,
                bug_info=bug_info, retry_count=retries,
                testname=shown_name,
                sponge_url=suite_job_keyvals.get('sponge_url'))
        self._web_links.append(log_link)

        if not view.should_display_buildbot_link():
            continue
        log_link.reason = view.get_buildbot_link_reason()
        self.buildbot_links.append(log_link)
def output_results(self):
    """Log test results, suite timings and links to the test logs.

    Every test result is logged with a column width of the longest test
    name plus 3, followed by self.timings and the text link of every
    entry in self._web_links.
    """
    # Output test results.
    results = self._make_test_results()
    longest_name = max(len(r.name) for r in results) if results else 0
    column_width = longest_name + 3
    for result in results:
        result.log_using(logging.info, column_width)

    # Output suite timings.
    logging.info(self.timings)

    # Output links to test logs.
    logging.info('\nLinks to test logs:')
    for web_link in self._web_links:
        logging.info(web_link.text_link)
    logging.info('\n')
For this case, we will assume 1540 # the aborted test will test all subsystems, set subsystem:default. 1541 if (test_info['status'] == 'ABORT' and 1542 not any('subsystem:' in a for a in test_info['attributes'])): 1543 test_info['attributes'].append('subsystem:default') 1544 1545 # Write the links to test logs into the |tests_dict| of |output_dict|. 1546 # For test whose status is not 'GOOD', the link is also buildbot_link. 1547 for link in self._web_links: 1548 test_name = link.anchor.strip() 1549 test_info = tests_dict.get(test_name) 1550 if test_info: 1551 test_info['link_to_logs'] = link.url 1552 test_info['sponge_url'] = link.sponge_url 1553 # Write the retry dashboard link into the dict. 1554 if link in self.buildbot_links and link.testname: 1555 test_info['retry_dashboard_link'] \ 1556 = reporting_utils.link_retry_url(link.testname) 1557 # Always write the wmatrix link for compatibility. 1558 test_info['wmatrix_link'] \ 1559 = reporting_utils.link_wmatrix_retry_url(link.testname) 1560 # Write the bug url into the dict. 1561 if link.bug_id: 1562 test_info['bug_url'] = link.bug_url 1563 1564 # Write the suite timings into |output_dict| 1565 timings = self.timings 1566 if timings is not None: 1567 time_dict = output_dict.setdefault('suite_timings', {}) 1568 time_dict.update({ 1569 'download_start' : str(timings.download_start_time), 1570 'payload_download_end' : str(timings.payload_end_time), 1571 'suite_start' : str(timings.suite_start_time), 1572 'artifact_download_end' : str(timings.artifact_end_time), 1573 'tests_start' : str(timings.tests_start_time), 1574 'tests_end' : str(timings.tests_end_time), 1575 }) 1576 1577 output_dict['suite_job_id'] = self._suite_job_id 1578 1579 return output_dict 1580 1581 1582 def run(self): 1583 """Collect test results. 1584 1585 This method goes through the following steps: 1586 Fetch relevent test views of the suite job. 1587 Fetch test views of child jobs 1588 Check whether the suite was aborted. 1589 Generate links. 
def gather_timing_stats(self):
    """Collect timing related statistics.

    Records the suite runtime, in seconds between the suite start time
    and the tests end time, through job_overhead.record_suite_runtime.
    A runtime of -1 is recorded when it cannot be computed.
    """
    # Record suite runtime in metadata db.
    # Some failure modes can leave times unassigned, report sentinel value
    # in that case. That includes self.timings itself: run() returns
    # before recording any timings when there are no test views, which
    # leaves self.timings set to None.
    runtime_in_secs = -1
    timings = self.timings
    if (timings is not None and
            timings.tests_end_time is not None and
            timings.suite_start_time is not None):
        runtime_in_secs = (timings.tests_end_time -
                           timings.suite_start_time).total_seconds()

    job_overhead.record_suite_runtime(self._suite_job_id, self._suite_name,
            self._board, self._build, self._num_child_jobs, runtime_in_secs)
def _make_child_deps_from_options(options):
    """Creates the extra dependencies for child jobs.

    @param options: Parsed arguments to run_suite.

    @returns: A list holding the single label 'model:<model>' if a model
              was requested. An empty tuple otherwise.
    """
    requested_model = options.model
    if requested_model:
        return ['model:%s' % requested_model]
    return ()
class SuiteResult(namedtuple('SuiteResult', ['return_code', 'output_dict'])):
    """Result of running a suite to return.

    A (return_code, output_dict) pair in which output_dict always carries
    the return code under the 'return_code' key.
    """

    def __new__(cls, return_code, output_dict=None):
        """Build the result without touching the caller's dictionary.

        @param return_code: The return code of the suite run.
        @param output_dict: Optional dictionary of output; a copy of it is
                            stored, an empty one is used if not given.
        """
        payload = dict() if output_dict is None else output_dict.copy()
        payload['return_code'] = return_code
        return super(SuiteResult, cls).__new__(cls, return_code, payload)
1742 1743 """ 1744 # If indicate to use the new style suite control file, convert the args 1745 if options.use_suite_attr: 1746 options = change_options_for_suite_attr(options) 1747 1748 log_name = _get_log_name(options) 1749 utils.setup_logging(logfile=log_name) 1750 1751 if not options.bypass_labstatus and not options.web: 1752 utils.check_lab_status(options.build) 1753 1754 afe = _create_afe(options) 1755 instance_server = afe.server 1756 1757 rpc_helper = diagnosis_utils.RPCHelper(afe) 1758 is_real_time = True 1759 if options.mock_job_id: 1760 job_id = int(options.mock_job_id) 1761 existing_job = afe.get_jobs(id=job_id, finished=True) 1762 if existing_job: 1763 is_real_time = False 1764 else: 1765 existing_job = afe.get_jobs(id=job_id) 1766 if existing_job: 1767 job_created_on = time_utils.date_string_to_epoch_time( 1768 existing_job[0].created_on) 1769 else: 1770 raise utils.TestLabException('Failed to retrieve job: %d' % job_id) 1771 else: 1772 try: 1773 rpc_helper.check_dut_availability(options.board, options.pool, 1774 options.minimum_duts, 1775 options.skip_duts_check) 1776 job_id = create_suite(afe, options) 1777 job_created_on = time.time() 1778 except (error.CrosDynamicSuiteException, 1779 error.RPCException, proxy.JSONRPCException) as e: 1780 logging.exception('Error Message: %s', e) 1781 return SuiteResult(RETURN_CODES.INFRA_FAILURE, 1782 {'return_message': str(e)}) 1783 except AttributeError as e: 1784 logging.exception('Error Message: %s', e) 1785 return SuiteResult(RETURN_CODES.INVALID_OPTIONS) 1786 1787 job_timer = diagnosis_utils.JobTimer( 1788 job_created_on, float(options.timeout_mins)) 1789 job_url = reporting_utils.link_job(job_id, 1790 instance_server=instance_server) 1791 logging.info('%s Created suite job: %s', 1792 job_timer.format_time(job_timer.job_created_time), 1793 job_url) 1794 logging.info(annotations.StepLink( 1795 text='Link to suite', 1796 url=job_url)) 1797 1798 if options.create_and_return: 1799 msg = '--create_and_return was 
specified, terminating now.' 1800 logging.info(msg) 1801 return SuiteResult(RETURN_CODES.OK, {'return_message': msg}) 1802 1803 if options.no_wait: 1804 return _handle_job_nowait(job_id, options, instance_server) 1805 else: 1806 return _handle_job_wait(afe, job_id, options, job_timer, is_real_time) 1807 1808 1809 def _get_log_name(options): 1810 """Return local log file's name. 1811 1812 @param options: Parsed options. 1813 1814 @return log_name, a string file name. 1815 """ 1816 if options.require_logfile: 1817 # options.build is verified to exist in verify_options. 1818 # convert build name from containing / to containing only _. 1819 log_name = 'run_suite-%s.log' % options.build.replace('/', '_') 1820 log_dir = os.path.join(common.autotest_dir, 'logs') 1821 if os.path.exists(log_dir): 1822 log_name = os.path.join(log_dir, log_name) 1823 1824 return log_name 1825 else: 1826 return None 1827 1828 1829 def _create_afe(options): 1830 """Return an afe instance based on options. 1831 1832 @param options Parsed options. 1833 1834 @return afe, an AFE instance. 1835 """ 1836 instance_server = (options.web if options.web else 1837 instance_for_pool(options.pool)) 1838 afe = frontend_wrappers.RetryingAFE(server=instance_server, 1839 timeout_min=options.afe_timeout_mins, 1840 delay_sec=options.delay_sec) 1841 logging.info('Autotest instance created: %s', instance_server) 1842 return afe 1843 1844 1845 def _handle_job_wait(afe, job_id, options, job_timer, is_real_time): 1846 """Handle suite job synchronously. 1847 1848 @param afe AFE instance. 1849 @param job_id Suite job id. 1850 @param options Parsed options. 1851 @param job_timer JobTimer for suite job. 1852 @param is_real_time Whether or not to handle job timeout. 1853 1854 @return SuiteResult of suite job. 
1855 """ 1856 rpc_helper = diagnosis_utils.RPCHelper(afe) 1857 instance_server = afe.server 1858 while not afe.get_jobs(id=job_id, finished=True): 1859 _poke_buildbot_with_output(afe, job_id, job_timer) 1860 if job_timer.debug_output_timer.poll(): 1861 logging.info('The suite job has another %s till timeout.', 1862 job_timer.timeout_hours - job_timer.elapsed_time()) 1863 time.sleep(10) 1864 logging.info('%s Suite job is finished.', 1865 diagnosis_utils.JobTimer.format_time(datetime.now())) 1866 # For most cases, ResultCollector should be able to determine whether 1867 # a suite has timed out by checking information in the test view. 1868 # However, occationally tko parser may fail on parsing the 1869 # job_finished time from the job's keyval file. So we add another 1870 # layer of timeout check in run_suite. We do the check right after 1871 # the suite finishes to make it as accurate as possible. 1872 # There is a minor race condition here where we might have aborted 1873 # for some reason other than a timeout, and the job_timer thinks 1874 # it's a timeout because of the jitter in waiting for results. 1875 # The consequence would be that run_suite exits with code 1876 # SUITE_TIMEOUT while it should have returned INFRA_FAILURE 1877 # instead, which should happen very rarely. 1878 # Note the timeout will have no sense when using -m option. 1879 is_suite_timeout = job_timer.is_suite_timeout() 1880 1881 # Extract the original suite name to record timing. 1882 original_suite_name = get_original_suite_name(options.name, 1883 options.suite_args) 1884 # Start collecting test results. 1885 logging.info('%s Start collecting test results and dump them to json.', 1886 diagnosis_utils.JobTimer.format_time(datetime.now())) 1887 TKO = frontend_wrappers.RetryingTKO(server=instance_server, 1888 timeout_min=options.afe_timeout_mins, 1889 delay_sec=options.delay_sec) 1890 # TODO(crbug.com/672348): It needs to be possible for provision 1891 # suite to pass if only a few tests fail. 
Otherwise, a single 1892 # failing test will be reported as failure even if the suite reports 1893 # success. 1894 if options.name == _PROVISION_SUITE: 1895 # TODO(crbug.com/672348): Creating the suite job requires that 1896 # suite_args contains num_required. 1897 return_code_function = _ProvisionReturnCodeComputer( 1898 num_required=options.suite_args['num_required']) 1899 else: 1900 return_code_function = _ReturnCodeComputer() 1901 collector = ResultCollector(instance_server=instance_server, 1902 afe=afe, tko=TKO, build=options.build, 1903 board=options.board, 1904 suite_name=options.name, 1905 suite_job_id=job_id, 1906 return_code_function=return_code_function, 1907 original_suite_name=original_suite_name) 1908 collector.run() 1909 # Dump test outputs into json. 1910 output_dict = collector.get_results_dict() 1911 output_dict['autotest_instance'] = instance_server 1912 if not options.json_dump: 1913 collector.output_results() 1914 result = collector.return_result 1915 if is_real_time: 1916 # Do not record stats if the suite was aborted (either by a user 1917 # or through the golo rpc). 1918 # Also do not record stats if is_aborted is None, indicating 1919 # aborting status is unknown yet. 1920 if collector.is_aborted == False: 1921 logging.info('%s Gathering timing stats for the suite job.', 1922 diagnosis_utils.JobTimer.format_time(datetime.now())) 1923 collector.gather_timing_stats() 1924 1925 if collector.is_aborted == True and is_suite_timeout: 1926 # There are two possible cases when a suite times out. 1927 # 1. the suite job was aborted due to timing out 1928 # 2. the suite job succeeded, but some child jobs 1929 # were already aborted before the suite job exited. 1930 # The case 2 was handled by ResultCollector, 1931 # here we handle case 1. 
1932 result |= _RETURN_RESULTS['suite_timeout'] 1933 logging.info('\n %s Attempting to display pool info: %s', 1934 diagnosis_utils.JobTimer.format_time(datetime.now()), 1935 options.pool) 1936 try: 1937 # Add some jitter to make up for any latency in 1938 # aborting the suite or checking for results. 1939 cutoff = job_timer.timeout_hours + timedelta(hours=0.3) 1940 rpc_helper.diagnose_pool( 1941 options.board, options.pool, cutoff) 1942 except proxy.JSONRPCException: 1943 logging.warning('Unable to display pool info.') 1944 1945 # And output return message. 1946 if result.message: 1947 logging.info('Reason: %s', result.message) 1948 1949 logging.info('\n %s Output below this line is for buildbot consumption:', 1950 diagnosis_utils.JobTimer.format_time(datetime.now())) 1951 log_buildbot_links(logging.info, collector.buildbot_links) 1952 return result.suite_result(output_dict) 1953 1954 1955 def _handle_job_nowait(job_id, options, instance_server): 1956 """Handle suite job asynchronously. 1957 1958 @param job_id Suite job id. 1959 @param options Parsed options. 1960 @param instance_server Autotest instance hostname. 1961 1962 @return SuiteResult of suite job. 1963 """ 1964 logging.info('Created suite job: %r', job_id) 1965 link = LogLink(options.name, instance_server, 1966 '%s-%s' % (job_id, getpass.getuser())) 1967 for generate_link in link.GenerateBuildbotLinks(): 1968 logging.info(generate_link) 1969 logging.info('--no_wait specified; Exiting.') 1970 return SuiteResult(RETURN_CODES.OK, 1971 {'return_message': '--no_wait specified; Exiting.'}) 1972 1973 1974 def _should_run(options): 1975 """Check whether the suite should be run based on lab/job status checking. 1976 1977 @param options Parsed options. 1978 """ 1979 try: 1980 site_utils.check_lab_status(options.test_source_build) 1981 except site_utils.TestLabException as ex: 1982 logging.exception('Lab is closed or build is blocked. 
Skipping ' 1983 'suite %s, board %s, build %s: %s', 1984 options.name, options.board, 1985 options.test_source_build, str(ex)) 1986 return False 1987 1988 start_time = str(datetime.now() - 1989 timedelta(days=_SEARCH_JOB_MAX_DAYS)) 1990 afe = _create_afe(options) 1991 afe_job_id = afe.get_jobs( 1992 name__istartswith=options.test_source_build, 1993 name__iendswith='control.'+options.name, 1994 created_on__gte=start_time, 1995 min_rpc_timeout=_MIN_RPC_TIMEOUT) 1996 if afe_job_id: 1997 logging.info('Found duplicate suite %s scheduled in past.', 1998 afe_job_id) 1999 return False 2000 2001 return True 2002 2003 2004 def _poke_buildbot_with_output(afe, job_id, job_timer): 2005 """Poke buildbot so it doesn't timeout from silence. 2006 2007 @param afe AFE instance. 2008 @param job_id Suite job id. 2009 @param job_timer JobTimer for suite job. 2010 """ 2011 rpc_helper = diagnosis_utils.RPCHelper(afe) 2012 # Note that this call logs output, preventing buildbot's 2013 # 9000 second silent timeout from kicking in. Let there be no 2014 # doubt, this is a hack. The timeout is from upstream buildbot and 2015 # this is the easiest work around. 2016 if job_timer.first_past_halftime(): 2017 rpc_helper.diagnose_job(job_id, afe.server) 2018 2019 2020 2021 def _run_task(options): 2022 """Perform this script's function minus setup. 2023 2024 Boilerplate like argument parsing, logging, output formatting happen 2025 elsewhere. 2026 2027 Returns a SuiteResult instance. 2028 2029 TODO(ayatane): The try/except should be moved into _run_suite(). 2030 Good luck trying to figure out which function calls are supposed to 2031 raise which of the exceptions. 
2032 """ 2033 try: 2034 return _run_suite(options) 2035 except diagnosis_utils.BoardNotAvailableError as e: 2036 result = SuiteResult( 2037 RETURN_CODES.BOARD_NOT_AVAILABLE, 2038 {'return_message': 'Skipping testing: %s' % e.message}) 2039 logging.info(result.output_dict['return_message']) 2040 return result 2041 except utils.TestLabException as e: 2042 result = SuiteResult( 2043 RETURN_CODES.INFRA_FAILURE, 2044 {'return_message': 'TestLabException: %s' % e}) 2045 logging.exception(result.output_dict['return_message']) 2046 return result 2047 2048 2049 class _ExceptionHandler(object): 2050 """Global exception handler replacement.""" 2051 2052 def __init__(self, dump_json): 2053 """Initialize instance. 2054 2055 @param dump_json: Whether to print a JSON dump of the result dict to 2056 stdout. 2057 """ 2058 self._should_dump_json = dump_json 2059 2060 def __call__(self, exc_type, value, traceback): 2061 if self._should_dump_json: 2062 _dump_json({'return_message': ('Unhandled run_suite exception: %s' 2063 % value)}) 2064 sys.exit(RETURN_CODES.INFRA_FAILURE) 2065 2066 2067 def main(): 2068 """Entry point.""" 2069 utils.verify_not_root_user() 2070 2071 parser = make_parser() 2072 options = parser.parse_args() 2073 if options.do_nothing: 2074 return 0 2075 2076 sys.exceptionhandler = _ExceptionHandler(dump_json=options.json_dump) 2077 if options.json_dump: 2078 logging.disable(logging.CRITICAL) 2079 2080 options_okay = verify_and_clean_options(options) 2081 # Set StreamHandler first to capture error messages if suite is not run. 
2082 utils.setup_logging() 2083 if not options_okay: 2084 parser.print_help() 2085 result = SuiteResult(RETURN_CODES.INVALID_OPTIONS) 2086 elif options.pre_check and not _should_run(options): 2087 logging.info('Suite %s-%s is terminated: Lab is closed, OR build is ' 2088 'blocked, OR this suite has already been kicked off ' 2089 'once in past %d days.', 2090 options.test_source_build, options.name, 2091 _SEARCH_JOB_MAX_DAYS) 2092 result = SuiteResult( 2093 RETURN_CODES.ERROR, 2094 {'return_message': ("Lab is closed OR other reason" 2095 " (see code, it's complicated)")}) 2096 else: 2097 result = _run_task(options) 2098 2099 if options.json_dump: 2100 _dump_json(result.output_dict) 2101 2102 logging.info('Will return from run_suite with status: %s', 2103 RETURN_CODES.get_string(result.return_code)) 2104 return result.return_code 2105 2106 2107 def _dump_json(obj): 2108 """Write obj JSON to stdout.""" 2109 output_json = json.dumps(obj, sort_keys=True) 2110 sys.stdout.write('#JSON_START#%s#JSON_END#' % output_json.strip()) 2111 2112 2113 if __name__ == "__main__": 2114 sys.exit(main()) 2115