# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import datetime
import logging

import common

from autotest_lib.client.common_lib import base_job
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import priorities
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import constants
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.cros.dynamic_suite import tools
from autotest_lib.server.cros.dynamic_suite.suite import Suite
from autotest_lib.tko import utils as tko_utils



"""CrOS dynamic test suite generation and execution module.

This module implements runtime-generated test suites for CrOS.
Design doc: http://goto.google.com/suitesv2

Individual tests can declare themselves as a part of one or more
suites, and the code here enables control files to be written
that can refer to these "dynamic suites" by name.  We also provide
support for reimaging devices with a given build and running a
dynamic suite across all reimaged devices.

The public API for defining a suite includes one method: reimage_and_run().
A suite control file can be written by importing this module and making
an appropriate call to this single method.  In normal usage, this control
file will be run in a 'hostless' server-side autotest job, scheduling
sub-jobs to do the needed reimaging and test running.

Example control file:

import common
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import dynamic_suite

dynamic_suite.reimage_and_run(
    build=build, board=board, name='bvt', job=job, pool=pool,
    check_hosts=check_hosts, add_experimental=True, num=num,
    devserver_url=devserver_url, version_prefix=provision.CROS_VERSION_PREFIX)

This will -- at runtime -- find all control files that contain "bvt" in their
"SUITE=" clause, schedule jobs to reimage |num| or less devices in the
specified pool of the specified board with the specified build and, upon
completion of those jobs, schedule and wait for jobs that run all the tests it
discovered.

Suites can be run by using the atest command-line tool:
  atest suite create -b <board> -i <build/name> <suite>
e.g.
  atest suite create -b x86-mario -i x86-mario/R20-2203.0.0 bvt

-------------------------------------------------------------------------
Implementation details

A Suite instance represents a single test suite, defined by some predicate
run over all known control files.  The simplest example is creating a Suite
by 'name'.

create_suite_job() takes the parameters needed to define a suite run (board,
build to test, machine pool, and which suite to run), ensures important
preconditions are met, finds the appropriate suite control file, and then
schedules the hostless job that will do the rest of the work.

Note that we have more than one Dev server in our test lab architecture.
We currently load balance per-build being tested, so one and only one dev
server is used by any given run through the reimaging/testing flow.

- create_suite_job()
The primary role of create_suite_job() is to ensure that the required
artifacts for the build to be tested are staged on the dev server.  This
includes payloads required to autoupdate machines to the desired build, as
well as the autotest control files appropriate for that build.  Then, the
RPC pulls the control file for the suite to be run from the dev server and
uses it to create the suite job with the autotest frontend.

          +----------------+
          | Google Storage |                                Client
          +----------------+                                   |
               |     ^                                         | create_suite_job()
 payloads/     |     |                                         |
 control files |     | request                                 |
               V     |                                         V
          +-------------+   download request    +--------------------------+
          |             |<----------------------|                          |
          | Dev Server  |                       | Autotest Frontend (AFE)  |
          |             |---------------------->|                          |
          +-------------+  suite control file   +--------------------------+
                                                             |
                                                             V
                                                    Suite Job (hostless)

- Reimage and Run
The overall process is to schedule all the tests, and then wait for the tests
to complete.

- The Reimaging Process

As an artifact of an old implementation, the number of machines to use
is called the 'sharding_factor', and the default is defined in the [CROS]
section of global_config.ini.  This can be overridden by passing a 'num=N'
parameter to create_suite_job(), which is piped through to reimage_and_run()
just like the 'build' and 'board' parameters are.  However, with provisioning,
this machine accounting hasn't been implemented nor removed.  However, 'num' is
still passed around, as it might be used one day.

A test control file can specify a list of DEPENDENCIES, which are really just
the set of labels a host needs to have in order for that test to be scheduled
on it.  In the case of a dynamic_suite, many tests in the suite may have
DEPENDENCIES specified.  All tests are scheduled with the DEPENDENCIES that
they specify, along with any suite dependencies that were specified, and the
scheduler will find and provision a host capable of running the test.

- Scheduling Suites
A Suite instance uses the labels specified in the suite dependencies to
schedule tests across all the hosts in the pool.  It then waits for all these
jobs.  As an optimization, the Dev server stages the payloads necessary to
run a suite in the background _after_ it has completed all the things
necessary for reimaging.  Before running a suite, reimage_and_run() calls out
to the Dev server and blocks until it's completed staging all build artifacts
needed to run test suites.

Step by step:
0) At instantiation time, find all appropriate control files for this suite
   that were included in the build to be tested.  To do this, we consult the
   Dev Server, where all these control files are staged.

          +------------+    control files?     +--------------------------+
          |            |<----------------------|                          |
          | Dev Server |                       | Autotest Frontend (AFE)  |
          |            |---------------------->|       [Suite Job]        |
          +------------+    control files!     +--------------------------+

1) Now that the Suite instance exists, it schedules jobs for every control
   file it deemed appropriate, to be run on the hosts that were labeled
   by the provisioning.  We stuff keyvals into these jobs, indicating what
   build they were testing and which suite they were for.

   +--------------------------+ Job for VersLabel       +--------+
   |                          |------------------------>| Host 1 | VersLabel
   | Autotest Frontend (AFE)  |            +--------+   +--------+
   |       [Suite Job]        |----------->| Host 2 |
   +--------------------------+ Job for    +--------+
       |                ^       VersLabel   VersLabel
       |                |
       +----------------+
        One job per test
        {'build': build/name,
         'suite': suite_name}

2) Now that all jobs are scheduled, they'll be doled out as labeled hosts
   finish their assigned work and become available again.

- Waiting on Suites
0) As we clean up each test job, we check to see if any crashes occurred.  If
   they did, we look at the 'build' keyval in the job to see which build's debug
   symbols we'll need to symbolicate the crash dump we just found.

1) Using this info, we tell a special Crash Server to stage the required debug
   symbols.  Once that's done, we ask the Crash Server to use those symbols to
   symbolicate the crash dump in question.

     +----------------+
     | Google Storage |
     +----------------+
          |     ^
 symbols! |     | symbols?
          V     |
      +------------+  stage symbols for build  +--------------------------+
      |            |<--------------------------|                          |
      |   Crash    |                           |                          |
      |   Server   |   dump to symbolicate     | Autotest Frontend (AFE)  |
      |            |<--------------------------|       [Suite Job]        |
      |            |-------------------------->|                          |
      +------------+    symbolicated dump      +--------------------------+

2) As jobs finish, we record their success or failure in the status of the suite
   job.  We also record a 'job keyval' in the suite job for each test, noting
   the job ID and job owner.  This can be used to refer to test logs later.
3) Once all jobs are complete, status is recorded for the suite job, and the
   job_repo_url host attribute is removed from all hosts used by the suite.

"""


DEFAULT_TRY_JOB_TIMEOUT_MINS = tools.try_job_timeout_mins()

# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.

class SuiteSpec(object):
    """
    This class contains the info that defines a suite run.

    Currently required:
    @var build: the build to install e.g.
                  x86-alex-release/R18-1655.0.0-a1-b1584.
    @var board: which kind of devices to reimage.
    @var devserver: An instance of the devserver to use with this suite.
    @var name: a value of the SUITE control file variable to search for.
    @var job: an instance of client.common_lib.base_job representing the
                currently running suite job.

    Currently supported optional fields:
    @var pool: specify the pool of machines to use for scheduling purposes.
               Default: None
    @var num: the maximum number of devices to reimage.
              Default in global_config
    @var check_hosts: require appropriate hosts to be available now.
    @var add_experimental: schedule experimental tests as well, or not.
                           Default: True
    @var dependencies: map of test names to dependency lists.
                       Initially {'': []}.
    @var suite_dependencies: A list of suite level dependencies, which act
                             just like test dependencies and are appended to
                             each test's set of dependencies at job creation
                             time.
    @var predicate: Optional. If present, should be a function
                    mapping ControlData objects to True if they should be
                    included in suite. If absent, suite behavior will
                    default to creating a suite based on the SUITE field
                    of control files.
    @var skip_reimage: Always False; kept only so that code reading this
                       attribute keeps working.
    """

    def _verify_builds(self, build, builds):
        """Verify the value of build and builds passed in to create a suite.

        TODO(crbug.com/496782): This method should be removed after R45 falls
        off stable channel. Add `builds` to required_keywords in __init__, and
        remove `build` in __init__.

        @param build: the build to install e.g.
                      x86-alex-release/R18-1655.0.0-a1-b1584.
        @param builds: the builds to install e.g.
                       {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                        'fw-version:': 'x86-alex-firmware/R36-5771.50.0'}

        @raise: SuiteArgumentException if value for build or builds is invalid.

        """
        if not builds and not build:
            raise error.SuiteArgumentException(
                    'reimage_and_run() needs at least one of builds or build '
                    'being specified.')
        # Validate the types before touching builds.values(); otherwise a
        # non-dict `builds` would surface as an AttributeError instead of the
        # documented SuiteArgumentException.
        build_arg_check = (('build', build, str), ('builds', builds, dict))
        for key, value, expected in build_arg_check:
            if value and not isinstance(value, expected):
                raise error.SuiteArgumentException(
                        'reimage_and_run() needs %s=<%r>' % (key, expected))
        if build and builds and build not in builds.values():
            raise error.SuiteArgumentException(
                    'Arguments build and builds for reimage_and_run() is '
                    'inconsistent. `build` must be one of the values of '
                    '`builds`. build="%s". builds="%s"' % (build, builds))


    def __init__(self, build=None, builds=None, board=None, name=None, job=None,
                 pool=None, num=None, check_hosts=True,
                 add_experimental=True, file_bugs=False,
                 file_experimental_bugs=False, max_runtime_mins=24*60,
                 timeout=24, timeout_mins=None, firmware_reimage=False,
                 suite_dependencies=None, version_prefix=None,
                 bug_template=None, devserver_url=None,
                 priority=priorities.Priority.DEFAULT, predicate=None,
                 wait_for_results=True, job_retry=False, max_retries=None,
                 offload_failures_only=False, test_source_build=None,
                 run_prod_code=False, **dargs):
        """
        Vets arguments for reimage_and_run() and populates self with supplied
        values.

        TODO(dshi): crbug.com/496782 once R45 falls off stable channel, we
        should remove option build, firmware_reimage and version_prefix, as they
        will be all merged into option builds.

        Currently required args:
        @param board: which kind of devices to reimage.
        @param name: a value of the SUITE control file variable to search for.
        @param job: an instance of client.common_lib.base_job representing the
                    currently running suite job.
        @param devserver_url: url to the selected devserver.

        Currently supported optional args:
        @param build: the build to install e.g.
                      x86-alex-release/R18-1655.0.0-a1-b1584.
        @param builds: the builds to install e.g.
                       {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                        'fw-version:': 'x86-alex-firmware/R36-5771.50.0'}
                       The dictionary is copied; the caller's object is not
                       modified.
        @param test_source_build: Build that contains the server-side test code,
                e.g., it can be the value of builds['cros-version:'] or
                builds['fw-version:']. Default is None, that is, use
                the server-side test code from builds['cros-version:']
        @param pool: specify the pool of machines to use for scheduling purposes
                     Default: None
        @param num: the maximum number of devices to reimage.
                    Default in global_config
        @param check_hosts: require appropriate hosts to be available now.
        @param add_experimental: schedule experimental tests as well, or not.
                                 Default: True
        @param file_bugs: File bugs when tests in this suite fail.
                          Default: False
        @param file_experimental_bugs: File bugs when experimental tests in
                                       this suite fail.
                                       Default: False
        @param max_runtime_mins: Max runtime in mins for each of the sub-jobs
                                 this suite will run.
        @param timeout: Max lifetime in hours for each of the sub-jobs that
                        this suite run.
        @param timeout_mins: Max lifetime in minutes for each of the sub-jobs.
                             When not set (or 0), timeout * 60 is used.
        @param firmware_reimage: True if we should use FW_RW_VERSION_PREFIX as
                                 the version_prefix.
                                 False if we should use CROS_VERSION_PREFIX as
                                 the version_prefix.
                                 (This flag has now been deprecated in favor of
                                  version_prefix.)
        @param suite_dependencies: A list of strings of suite level
                                   dependencies, which act just like test
                                   dependencies and are appended to each test's
                                   set of dependencies at job creation time.
                                   A string of comma separated labels is
                                   accepted for backwards compatibility.
                                   Default: None, meaning no suite dependencies.
                                   The list is copied; the caller's object is
                                   not modified.
        @param bug_template: A template dictionary specifying the default bug
                             filing options for failures in this suite.
                             Default: None, meaning an empty template.
        @param version_prefix: A version prefix from provision.py that the
                               tests should be scheduled with.
        @param priority: Integer priority level.  Higher is more important.
        @param predicate: Optional argument. If present, should be a function
                          mapping ControlData objects to True if they should be
                          included in suite. If argument is absent, suite
                          behavior will default to creating a suite of based
                          on the SUITE field of control files.
        @param wait_for_results: Set to False to run the suite job without
                                 waiting for test jobs to finish. Default is
                                 True.
        @param job_retry: Set to True to enable job-level retry. Default is
                          False.
        @param max_retries: Maximum retry limit at suite level.
                            Regardless how many times each individual test
                            has been retried, the total number of retries
                            happening in the suite can't exceed _max_retries.
                            Default to None, no max.
        @param offload_failures_only: Only enable gs_offloading for failed
                                      jobs.
        @param run_prod_code: If true, the suite will run the test code that
                              lives in prod aka the test code currently on the
                              lab servers.
        @param **dargs: these arguments will be ignored.  This allows us to
                        deprecate and remove arguments in ToT while not
                        breaking branch builds.

        @raise: SuiteArgumentException if a required argument is missing or
                has the wrong type, or if build/builds are invalid.
        """
        # TODO(dshi): crbug.com/496782 Following should be added to
        # required_args after R45 falls off stable channel:
        # 'builds': dict,
        # To allow the transition, build is removed from the list, but the code
        # will check either build or builds should exist.
        required_args = (('board', board, str),
                         ('name', name, str),
                         ('job', job, base_job.base_job),
                         ('devserver_url', devserver_url, str))
        for key, value, expected in required_args:
            if not value or not isinstance(value, expected):
                raise error.SuiteArgumentException(
                        'reimage_and_run() needs %s=<%r>' % (key, expected))
        self._verify_builds(build, builds)

        self.board = 'board:%s' % board
        self.devserver = dev_server.ImageServer(devserver_url)

        if builds:
            # Copy, because the CrOS entry is rewritten below with the
            # translated build name and the caller's dict must stay untouched.
            self.builds = dict(builds)
        else:
            # TODO(dshi): crbug.com/496782 This warning can be removed after R45
            # falls off stable channel.
            logging.warning('reimage_and_run arguments firmware_reimage and '
                            'version_prefix have been deprecated. Please use '
                            'a dictionary builds to specify images, e.g., '
                            '{\'cros-version:\':\'peppy-release/R38-5655.0.0\','
                            ' \'fw-version:\':\'peppy-firmware/R36-5371.0.0\'}')

            if version_prefix:
                prefix = version_prefix
            else:
                prefix = (provision.FW_RW_VERSION_PREFIX if firmware_reimage
                          else provision.CROS_VERSION_PREFIX)
            self.builds = {prefix: build}

        if provision.CROS_VERSION_PREFIX in self.builds:
            translated_build = self.devserver.translate(
                    self.builds[provision.CROS_VERSION_PREFIX])
            self.builds[provision.CROS_VERSION_PREFIX] = translated_build

        if test_source_build:
            test_source_build = self.devserver.translate(test_source_build)

        self.test_source_build = Suite.get_test_source_build(
                self.builds, test_source_build=test_source_build)

        self.name = name
        self.job = job
        if pool:
            self.pool = 'pool:%s' % pool
        else:
            self.pool = pool
        self.num = num
        self.check_hosts = check_hosts
        # There is no skip_reimage argument any more.  The bare name used to
        # resolve to the module-level skip_reimage() function, which made this
        # attribute an always-truthy function object.  That helper always
        # returns False, so record False here.
        self.skip_reimage = False
        self.add_experimental = add_experimental
        self.file_bugs = file_bugs
        self.file_experimental_bugs = file_experimental_bugs
        self.dependencies = {'': []}
        self.max_runtime_mins = max_runtime_mins
        self.timeout = timeout
        self.timeout_mins = timeout_mins or timeout * 60
        if suite_dependencies is None:
            self.suite_dependencies = []
        elif isinstance(suite_dependencies, str):
            # Backwards compatible comma separated form.  Empty entries (e.g.
            # from an empty string) are dropped, since an empty label can
            # never be a meaningful dependency.
            labels = (dep.strip(' ') for dep in suite_dependencies.split(','))
            self.suite_dependencies = [dep for dep in labels if dep]
        else:
            # Copy, because reimage_and_run() extends this list in place.
            self.suite_dependencies = list(suite_dependencies)
        self.bug_template = {} if bug_template is None else bug_template
        self.priority = priority
        self.predicate = predicate
        self.wait_for_results = wait_for_results
        self.job_retry = job_retry
        self.max_retries = max_retries
        self.offload_failures_only = offload_failures_only
        self.run_prod_code = run_prod_code


def skip_reimage(g):
    """
    Deprecated helper that used to pull the SKIP_IMAGE value out of a global
    variables dictionary.  The dictionary is no longer consulted.

    @param g: The global variables dictionary (ignored).
    @return: Always False.
    """
    return False


def reimage_and_run(**dargs):
    """
    Backward-compatible API for dynamic_suite.

    Will re-image a number of devices (of the specified board) with the
    provided build, and then run the indicated test suite on them.
    Guaranteed to be compatible with any build from stable to dev.

    @param dargs: Dictionary containing the arguments listed below.

    Currently required args:
    @param board: which kind of devices to reimage.
    @param name: a value of the SUITE control file variable to search for.
    @param job: an instance of client.common_lib.base_job representing the
                currently running suite job.

    Currently supported optional args:
    @param build: the build to install e.g.
                  x86-alex-release/R18-1655.0.0-a1-b1584.
    @param builds: the builds to install e.g.
                   {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                    'fw-version:': 'x86-alex-firmware/R36-5771.50.0'}
    @param pool: specify the pool of machines to use for scheduling purposes.
                 Default: None
    @param num: the maximum number of devices to reimage.
                Default in global_config
    @param check_hosts: require appropriate hosts to be available now.
    @param add_experimental: schedule experimental tests as well, or not.
                             Default: True
    @param file_bugs: automatically file bugs on test failures.
                      Default: False
    @param suite_dependencies: A list of suite level dependencies (or, for
                               backwards compatibility, a string with a comma
                               separated list), which act just like test
                               dependencies and are appended to each test's
                               set of dependencies at job creation time.
    @param devserver_url: url to the selected devserver.
    @param predicate: Optional argument. If present, should be a function
                      mapping ControlData objects to True if they should be
                      included in suite. If argument is absent, suite
                      behavior will default to creating a suite of based
                      on the SUITE field of control files.
    @param job_retry: A bool value indicating whether jobs should be retried
                      on failure. If True, the field 'JOB_RETRIES' in control
                      files will be respected. If False, do not retry.
    @param max_retries: Maximum retry limit at suite level.
                        Regardless how many times each individual test
                        has been retried, the total number of retries
                        happening in the suite can't exceed _max_retries.
                        Default to None, no max.
    @param offload_failures_only: Only enable gs_offloading for failed jobs.
    @raises AsynchronousBuildFailure: if there was an issue finishing staging
                                      from the devserver.
    @raises MalformedDependenciesException: if the dependency_info file for
                                            the required build fails to parse.
    """
    suite_spec = SuiteSpec(**dargs)

    # To support provision both CrOS and firmware, option builds is added to
    # SuiteSpec, e.g.,
    # builds = {'cros-version:': 'x86-alex-release/R18-1655.0.0',
    #           'fw-version:': 'x86-alex-firmware/R36-5771.50.0'}
    # Option build, version_prefix and firmware_reimage will all be obsoleted.
    # For backwards compatibility, these option will be default to
    # firmware_reimage = False
    # version_prefix = provision.CROS_VERSION_PREFIX
    # build will be used as CrOS build
    suite_spec.firmware_reimage = False
    # End of backwards compatibility hacks.

    # version_prefix+build should make it into each test as a DEPENDENCY.  The
    # easiest way to do this is to tack it onto the suite_dependencies.
    suite_spec.suite_dependencies.extend(
            provision.join(version_prefix, build)
            for version_prefix, build in suite_spec.builds.items())

    afe = frontend_wrappers.RetryingAFE(timeout_min=30, delay_sec=10,
                                        user=suite_spec.job.user, debug=False)
    tko = frontend_wrappers.RetryingTKO(timeout_min=30, delay_sec=10,
                                        user=suite_spec.job.user, debug=False)

    try:
        # NOTE(review): TypeError is caught as well in case the lookup yields
        # None rather than a non-numeric string -- confirm against
        # tko_utils.get_afe_job_id().
        my_job_id = int(tko_utils.get_afe_job_id(suite_spec.job.tag))
    except (ValueError, TypeError):
        my_job_id = None
        logging.warning('Could not determine own job id.')
    else:
        logging.debug('Determined own job id: %d', my_job_id)

    if suite_spec.predicate is None:
        predicate = Suite.name_in_tag_predicate(suite_spec.name)
    else:
        predicate = suite_spec.predicate

    _perform_reimage_and_run(suite_spec, afe, tko,
                             predicate, suite_job_id=my_job_id)

    logging.debug('Returning from dynamic_suite.reimage_and_run.')


def _perform_reimage_and_run(spec, afe, tko, predicate, suite_job_id=None):
    """
    Do the work of reimaging hosts and running tests.

    Stages the control files (unless prod code is requested), records the
    staging-finished timestamp as a keyval in the suite job's results
    directory, schedules the tests and, if requested, waits for them.

    @param spec: a populated SuiteSpec object.
    @param afe: an instance of AFE as defined in server/frontend.py.
    @param tko: an instance of TKO as defined in server/frontend.py.
    @param predicate: A function mapping ControlData objects to True if they
                      should be included in the suite.
    @param suite_job_id: Job id that will act as parent id to all sub jobs.
                         Default: None
    @raises AsynchronousBuildFailure: if staging the control files on the
                                      devserver fails.
    """
    # We can't do anything else until the devserver has finished downloading
    # control_files and test_suites packages so that we can get the control
    # files we should schedule.
    try:
        if not spec.run_prod_code:
            spec.devserver.stage_artifacts(spec.test_source_build,
                                           ['control_files', 'test_suites'])
    except dev_server.DevServerException as e:
        # If we can't get the control files, there's nothing to run.
        raise error.AsynchronousBuildFailure(e)

    timestamp = datetime.datetime.now().strftime(time_utils.TIME_FMT)
    utils.write_keyval(
        spec.job.resultdir,
        {constants.ARTIFACT_FINISHED_TIME: timestamp})

    suite = Suite.create_from_predicates(
        predicates=[predicate], name=spec.name,
        builds=spec.builds, board=spec.board, devserver=spec.devserver,
        afe=afe, tko=tko, pool=spec.pool,
        results_dir=spec.job.resultdir,
        max_runtime_mins=spec.max_runtime_mins, timeout_mins=spec.timeout_mins,
        file_bugs=spec.file_bugs,
        file_experimental_bugs=spec.file_experimental_bugs,
        suite_job_id=suite_job_id, extra_deps=spec.suite_dependencies,
        priority=spec.priority, wait_for_results=spec.wait_for_results,
        job_retry=spec.job_retry, max_retries=spec.max_retries,
        offload_failures_only=spec.offload_failures_only,
        test_source_build=spec.test_source_build,
        run_prod_code=spec.run_prod_code)

    # Now we get to asynchronously schedule tests.
    suite.schedule(spec.job.record_entry, spec.add_experimental)

    if suite.wait_for_results:
        logging.debug('Waiting on suite.')
        suite.wait(spec.job.record_entry, spec.bug_template)
        logging.debug('Finished waiting on suite. '
                      'Returning from _perform_reimage_and_run.')
    else:
        logging.info('wait_for_results is set to False, suite job will exit '
                     'without waiting for test jobs to finish.')