# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""CrOS dynamic test suite generation and execution module.

This module implements runtime-generated test suites for CrOS.
Design doc: http://goto.google.com/suitesv2

Individual tests can declare themselves as a part of one or more
suites, and the code here enables control files to be written
that can refer to these "dynamic suites" by name.  We also provide
support for reimaging devices with a given build and running a
dynamic suite across all reimaged devices.

The public API for defining a suite includes one method: reimage_and_run().
A suite control file can be written by importing this module and making
an appropriate call to this single method.  In normal usage, this control
file will be run in a 'hostless' server-side autotest job, scheduling
sub-jobs to do the needed reimaging and test running.

Example control file:

import common
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import dynamic_suite

dynamic_suite.reimage_and_run(
    builds={provision.CROS_VERSION_PREFIX: build}, board=board, name='bvt',
    job=job, pool=pool, check_hosts=check_hosts, add_experimental=True, num=num,
    devserver_url=devserver_url)

This will -- at runtime -- find all control files that contain "bvt" in their
"SUITE=" clause, schedule jobs to reimage |num| or less devices in the
specified pool of the specified board with the specified build and, upon
completion of those jobs, schedule and wait for jobs that run all the tests it
discovered.

Suites can be run by using the atest command-line tool:
  atest suite create -b <board> -i <build/name> <suite>
e.g.
  atest suite create -b x86-mario -i x86-mario/R20-2203.0.0 bvt

-------------------------------------------------------------------------
Implementation details

A Suite instance represents a single test suite, defined by some predicate
run over all known control files.  The simplest example is creating a Suite
by 'name'.

create_suite_job() takes the parameters needed to define a suite run (board,
build to test, machine pool, and which suite to run), ensures important
preconditions are met, finds the appropriate suite control file, and then
schedules the hostless job that will do the rest of the work.

Note that we have more than one Dev server in our test lab architecture.
We currently load balance per-build being tested, so one and only one dev
server is used by any given run through the reimaging/testing flow.

- create_suite_job()
The primary role of create_suite_job() is to ensure that the required
artifacts for the build to be tested are staged on the dev server.  This
includes payloads required to autoupdate machines to the desired build, as
well as the autotest control files appropriate for that build.  Then, the
RPC pulls the control file for the suite to be run from the dev server and
uses it to create the suite job with the autotest frontend.

                  +----------------+
                  | Google Storage |                         Client
                  +----------------+                            |
                    |            ^                              | create_suite_job()
   payloads/        |            |                              |
   control files    |            | request                      |
                    V            |                              V
  +-------------+   download request    +--------------------------+
  |             |<----------------------|                          |
  | Dev Server  |                       | Autotest Frontend (AFE)  |
  |             |---------------------->|                          |
  +-------------+  suite control file   +--------------------------+
                                                      |
                                                      V
                                              Suite Job (hostless)

- Reimage and Run
The overall process is to schedule all the tests, and then wait for the tests
to complete.

- The Reimaging Process

As an artifact of an old implementation, the number of machines to use
is called the 'sharding_factor', and the default is defined in the [CROS]
section of global_config.ini.  This can be overridden by passing a 'num=N'
parameter to create_suite_job(), which is piped through to reimage_and_run()
just like the 'build' and 'board' parameters are.  With provisioning, however,
this machine accounting has been neither implemented nor removed; 'num' is
still passed around, as it might be used one day.

A test control file can specify a list of DEPENDENCIES, which are really just
the set of labels a host needs to have in order for that test to be scheduled
on it.  In the case of a dynamic_suite, many tests in the suite may have
DEPENDENCIES specified.  All tests are scheduled with the DEPENDENCIES that
they specify, along with any suite dependencies that were specified, and the
scheduler will find and provision a host capable of running the test.

- Scheduling Suites
A Suite instance uses the labels specified in the suite dependencies to
schedule tests across all the hosts in the pool.  It then waits for all these
jobs.  As an optimization, the Dev server stages the payloads necessary to
run a suite in the background _after_ it has completed all the things
necessary for reimaging.  Before running a suite, reimage_and_run() calls out
to the Dev server and blocks until it's completed staging all build artifacts
needed to run test suites.

Step by step:
0) At instantiation time, find all appropriate control files for this suite
   that were included in the build to be tested.  To do this, we consult the
   Dev Server, where all these control files are staged.

  +------------+    control files?     +--------------------------+
  |            |<----------------------|                          |
  | Dev Server |                       | Autotest Frontend (AFE)  |
  |            |---------------------->|       [Suite Job]        |
  +------------+    control files!     +--------------------------+

1) Now that the Suite instance exists, it schedules jobs for every control
   file it deemed appropriate, to be run on the hosts that were labeled
   by the provisioning.  We stuff keyvals into these jobs, indicating what
   build they were testing and which suite they were for.

  +--------------------------+  Job for VersLabel      +--------+
  |                          |------------------------>| Host 1 | VersLabel
  | Autotest Frontend (AFE)  |            +--------+   +--------+
  |       [Suite Job]        |----------->| Host 2 |
  +--------------------------+  Job for   +--------+
      |                ^        VersLabel  VersLabel
      |                |
      +----------------+
       One job per test
       {'build': build/name,
        'suite': suite_name}

2) Now that all jobs are scheduled, they'll be doled out as labeled hosts
   finish their assigned work and become available again.

- Waiting on Suites
0) As we clean up each test job, we check to see if any crashes occurred.  If
   they did, we look at the 'build' keyval in the job to see which build's debug
   symbols we'll need to symbolicate the crash dump we just found.

1) Using this info, we tell a special Crash Server to stage the required debug
   symbols.  Once that's done, we ask the Crash Server to use those symbols to
   symbolicate the crash dump in question.

                  +----------------+
                  | Google Storage |
                  +----------------+
                    |          ^
          symbols!  |          |  symbols?
                    V          |
  +------------+  stage symbols for build  +--------------------------+
  |            |<--------------------------|                          |
  |   Crash    |                           |                          |
  |   Server   |    dump to symbolicate    | Autotest Frontend (AFE)  |
  |            |<--------------------------|       [Suite Job]        |
  |            |-------------------------->|                          |
  +------------+     symbolicated dump     +--------------------------+

2) As jobs finish, we record their success or failure in the status of the suite
   job.  We also record a 'job keyval' in the suite job for each test, noting
   the job ID and job owner.  This can be used to refer to test logs later.
3) Once all jobs are complete, status is recorded for the suite job, and the
   job_repo_url host attribute is removed from all hosts used by the suite.

"""

import datetime
import logging
import time

import common

from autotest_lib.client.common_lib import base_job
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import priorities
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import constants
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.cros.dynamic_suite import tools
from autotest_lib.server.cros.dynamic_suite.suite import Suite
from autotest_lib.tko import utils as tko_utils


DEFAULT_TRY_JOB_TIMEOUT_MINS = tools.try_job_timeout_mins()

# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.

class SuiteSpec(object):
    """
    This class contains the info that defines a suite run.

    Currently required:
    @var builds: the builds to install, keyed by version prefix, e.g.
                 {'cros-version:': 'x86-alex-release/R18-1655.0.0-a1-b1584'}.
    @var board: which kind of devices to reimage, stored as a 'board:<name>'
                label.
    @var devserver: An instance of the devserver to use with this suite.
    @var name: a value of the SUITE control file variable to search for.
    @var job: an instance of client.common_lib.base_job representing the
              currently running suite job.

    Currently supported optional fields:
    @var pool: specify the pool of machines to use for scheduling purposes,
               stored as a 'pool:<name>' label.
               Default: None
    @var num: the maximum number of devices to reimage.
              Default in global_config
    @var check_hosts: require appropriate hosts to be available now.
    @var skip_reimage: always False; the attribute is only kept so that code
                       reading it keeps working.
    @var add_experimental: schedule experimental tests as well, or not.
                           Default: True
    @var dependencies: map of test names to dependency lists.
                       Initially {'': []}.
    @var suite_dependencies: A list of suite level dependencies, which act
                             just like test dependencies and are appended to
                             each test's set of dependencies at job creation
                             time.
    @var predicate: A function mapping ControlData objects to True if they
                    should be included in suite.  When none is supplied, the
                    suite is built from the SUITE field of control files.
    @var test_args: A dict of args passed all the way to each individual test
                    that will actually be run.
    """

    # Keyword arguments that must be supplied, non-empty and of the given type.
    _REQUIRED_KEYWORDS = {
            'board': str,
            'builds': dict,
            'name': str,
            'job': base_job.base_job,
            'devserver_url': str,
    }

    # Build version prefixes whose values are run through devserver.translate().
    _VERSION_PREFIXES = frozenset((
            provision.CROS_VERSION_PREFIX,
            provision.ANDROID_BUILD_VERSION_PREFIX,
    ))

    def __init__(
            self,
            builds=None,
            board=None,
            name=None,
            job=None,
            pool=None,
            num=None,
            check_hosts=True,
            add_experimental=True,
            file_bugs=False,
            file_experimental_bugs=False,
            max_runtime_mins=24*60,
            timeout=24,
            timeout_mins=None,
            suite_dependencies=None,
            bug_template=None,
            devserver_url=None,
            priority=priorities.Priority.DEFAULT,
            predicate=None,
            wait_for_results=True,
            job_retry=False,
            max_retries=None,
            offload_failures_only=False,
            test_source_build=None,
            run_prod_code=False,
            delay_minutes=0,
            job_keyvals=None,
            test_args=None,
            **dargs):
        """
        Vets arguments for reimage_and_run() and populates self with supplied
        values.

        Currently required args:
        @param board: which kind of devices to reimage.
        @param name: a value of the SUITE control file variable to search for.
        @param job: an instance of client.common_lib.base_job representing the
                    currently running suite job.
        @param devserver_url: url to the selected devserver.
        @param builds: the builds to install e.g.
                       {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                        'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0'}
                       The dict is copied; the caller's object is not
                       modified.

        Currently supported optional args:
        @param test_source_build: Build that contains the server-side test code,
                e.g., it can be the value of builds['cros-version:'] or
                builds['fw-version:']. Default is None, that is, use
                the server-side test code from builds['cros-version:']
        @param pool: specify the pool of machines to use for scheduling purposes
                     Default: None
        @param num: the maximum number of devices to reimage.
                    Default in global_config
        @param check_hosts: require appropriate hosts to be available now.
        @param add_experimental: schedule experimental tests as well, or not.
                                 Default: True
        @param file_bugs: File bugs when tests in this suite fail.
                          Default: False
        @param file_experimental_bugs: File bugs when experimental tests in
                                       this suite fail.
                                       Default: False
        @param max_runtime_mins: Max runtime in mins for each of the sub-jobs
                                 this suite will run.
        @param timeout: Max lifetime in hours for each of the sub-jobs that
                        this suite run.
        @param timeout_mins: Max lifetime in minutes for each of the sub-jobs.
                             When falsy (None or 0), timeout * 60 is used.
        @param suite_dependencies: A list of strings of suite level
                                   dependencies, which act just like test
                                   dependencies and are appended to each test's
                                   set of dependencies at job creation time.
                                   A string of comma separated labels is
                                   accepted for backwards compatibility.
        @param bug_template: A template dictionary specifying the default bug
                             filing options for failures in this suite.
        @param priority: Integer priority level.  Higher is more important.
        @param predicate: Optional argument. If present, should be a function
                          mapping ControlData objects to True if they should be
                          included in suite. If argument is absent, suite
                          behavior will default to creating a suite based
                          on the SUITE field of control files.
        @param wait_for_results: Set to False to run the suite job without
                                 waiting for test jobs to finish. Default is
                                 True.
        @param job_retry: Set to True to enable job-level retry. Default is
                          False.
        @param max_retries: Maximum retry limit at suite level.
                            Regardless how many times each individual test
                            has been retried, the total number of retries
                            happening in the suite can't exceed _max_retries.
                            Default to None, no max.
        @param offload_failures_only: Only enable gs_offloading for failed
                                      jobs.
        @param run_prod_code: If true, the suite will run the test code that
                              lives in prod aka the test code currently on the
                              lab servers.
        @param delay_minutes: Delay the creation of test jobs for a given number
                              of minutes.
        @param job_keyvals: General job keyvals to be inserted into keyval file
        @param test_args: A dict of args passed all the way to each individual
                          test that will actually be run.
        @param **dargs: these arguments will be ignored.  This allows us to
                        deprecate and remove arguments in ToT while not
                        breaking branch builds.

        @raises error.SuiteArgumentException: if a required argument is
                missing, empty, or of the wrong type.
        """
        self._check_init_params(
                board=board,
                builds=builds,
                name=name,
                job=job,
                devserver_url=devserver_url)

        self.board = 'board:%s' % board
        # Work on a private copy: _translate_builds() rewrites entries in
        # place and must not alter the dict the caller handed us.
        self.builds = dict(builds)
        self.name = name
        self.job = job
        self.pool = ('pool:%s' % pool) if pool else pool
        self.num = num
        self.check_hosts = check_hosts
        # There is no skip_reimage argument any more.  This used to pick up
        # the module-level skip_reimage() function object (always truthy);
        # store the value that function returns instead.
        self.skip_reimage = False
        self.add_experimental = add_experimental
        self.file_bugs = file_bugs
        self.file_experimental_bugs = file_experimental_bugs
        self.dependencies = {'': []}
        self.max_runtime_mins = max_runtime_mins
        self.timeout = timeout
        self.timeout_mins = timeout_mins or timeout * 60
        self.bug_template = {} if bug_template is None else bug_template
        self.priority = priority
        self.wait_for_results = wait_for_results
        self.job_retry = job_retry
        self.max_retries = max_retries
        self.offload_failures_only = offload_failures_only
        self.run_prod_code = run_prod_code
        self.delay_minutes = delay_minutes
        self.job_keyvals = job_keyvals
        self.test_args = test_args

        # Order matters: the devserver is needed to translate build names, and
        # the test source build is resolved before self.builds is translated.
        self._init_predicate(predicate)
        self._init_suite_dependencies(suite_dependencies)
        self._init_devserver(devserver_url)
        self._init_test_source_build(test_source_build)
        self._translate_builds()
        self._add_builds_to_suite_deps()

    def _check_init_params(self, **kwargs):
        """Validate the required constructor arguments.

        @param kwargs: the candidate values, keyed by argument name.

        @raises error.SuiteArgumentException: if any key listed in
                _REQUIRED_KEYWORDS is missing, falsy, or not an instance of
                the expected type.
        """
        for key, expected_type in self._REQUIRED_KEYWORDS.items():
            value = kwargs.get(key)
            # TODO(ayatane): `not value` includes both the cases where value is
            # None and where value is the correct type, but empty (e.g., empty
            # dict).  It looks like this is NOT the intended behavior, but I'm
            # hesitant to remove it in case something is actually relying on
            # this behavior.
            if not value or not isinstance(value, expected_type):
                raise error.SuiteArgumentException(
                        'reimage_and_run() needs %s=<%r>'
                        % (key, expected_type))

    def _init_predicate(self, predicate):
        """Initialize predicate attribute.

        @param predicate: the caller-supplied predicate, or None to select
                          tests via Suite.name_in_tag_predicate(self.name).
        """
        if predicate is None:
            self.predicate = Suite.name_in_tag_predicate(self.name)
        else:
            self.predicate = predicate


    def _init_suite_dependencies(self, suite_dependencies):
        """Initialize suite dependencies attribute.

        @param suite_dependencies: None, a comma separated string of labels,
                                   or an iterable of labels.
        """
        if suite_dependencies is None:
            self.suite_dependencies = []
        elif isinstance(suite_dependencies, str):
            self.suite_dependencies = [dep.strip(' ') for dep
                                       in suite_dependencies.split(',')]
        else:
            # Copy so that _add_builds_to_suite_deps() extends our own list
            # rather than the one owned by the caller.
            self.suite_dependencies = list(suite_dependencies)

    def _init_devserver(self, devserver_url):
        """Initialize devserver attribute.

        An AndroidBuildServer is used when self.builds carries an Android
        build version prefix; otherwise an ImageServer is used.

        @param devserver_url: url to the selected devserver.
        """
        if provision.ANDROID_BUILD_VERSION_PREFIX in self.builds:
            self.devserver = dev_server.AndroidBuildServer(devserver_url)
        else:
            self.devserver = dev_server.ImageServer(devserver_url)

    def _init_test_source_build(self, test_source_build):
        """Initialize test_source_build attribute.

        @param test_source_build: build that contains the server-side test
                                  code, or a falsy value to let
                                  Suite.get_test_source_build() choose one
                                  from self.builds.
        """
        if test_source_build:
            test_source_build = self.devserver.translate(test_source_build)

        self.test_source_build = Suite.get_test_source_build(
                self.builds, test_source_build=test_source_build)

    def _translate_builds(self):
        """Translate build names if they are in LATEST format.

        Only entries whose key is one of _VERSION_PREFIXES are passed through
        the devserver; self.builds is updated in place.
        """
        for prefix in self._VERSION_PREFIXES:
            if prefix in self.builds:
                translated_build = self.devserver.translate(
                        self.builds[prefix])
                self.builds[prefix] = translated_build

    def _add_builds_to_suite_deps(self):
        """Add builds to suite_dependencies.

        To support provision both CrOS and firmware, option builds are added to
        SuiteSpec, e.g.,

        builds = {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                  'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0'}

        version_prefix+build should make it into each test as a DEPENDENCY.
        The easiest way to do this is to tack it onto the suite_dependencies.
        """
        self.suite_dependencies.extend(
                provision.join(version_prefix, build)
                for version_prefix, build in self.builds.items()
        )


def skip_reimage(g):
    """
    Report whether reimaging should be skipped; this is always False.

    The argument is accepted but not consulted.

    @param g: The global variables dictionary (unused).
    @return: False
    """
    return False


def reimage_and_run(**dargs):
    """
    Backward-compatible API for dynamic_suite.

    Will re-image a number of devices (of the specified board) with the
    provided builds, and then run the indicated test suite on them.
    Guaranteed to be compatible with any build from stable to dev.

    @param dargs: Dictionary containing the arguments listed below.

    Currently required args:
    @param board: which kind of devices to reimage.
    @param name: a value of the SUITE control file variable to search for.
    @param job: an instance of client.common_lib.base_job representing the
                currently running suite job.
    @param builds: the builds to install e.g.
                   {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                    'fw-version:': 'x86-alex-firmware/R36-5771.50.0'}
    @param devserver_url: url to the selected devserver.

    Currently supported optional args:
    @param pool: specify the pool of machines to use for scheduling purposes.
                 Default: None
    @param num: the maximum number of devices to reimage.
                Default in global_config
    @param check_hosts: require appropriate hosts to be available now.
    @param add_experimental: schedule experimental tests as well, or not.
                             Default: True
    @param file_bugs: automatically file bugs on test failures.
                      Default: False
    @param suite_dependencies: A string with a comma separated list of suite
                               level dependencies, which act just like test
                               dependencies and are appended to each test's
                               set of dependencies at job creation time.
    @param predicate: Optional argument. If present, should be a function
                      mapping ControlData objects to True if they should be
                      included in suite. If argument is absent, suite
                      behavior will default to creating a suite based
                      on the SUITE field of control files.
    @param job_retry: A bool value indicating whether jobs should be retried
                      on failure. If True, the field 'JOB_RETRIES' in control
                      files will be respected. If False, do not retry.
    @param max_retries: Maximum retry limit at suite level.
                        Regardless how many times each individual test
                        has been retried, the total number of retries
                        happening in the suite can't exceed _max_retries.
                        Default to None, no max.
    @param offload_failures_only: Only enable gs_offloading for failed jobs.
    @param test_args: A dict of args passed all the way to each individual test
                      that will actually be run.
    @raises SuiteArgumentException: if a required argument is missing, empty
                                    or of the wrong type.
    @raises AsynchronousBuildFailure: if there was an issue finishing staging
                                      from the devserver.
    @raises MalformedDependenciesException: if the dependency_info file for
                                            the required build fails to parse.
    """
    suite_spec = SuiteSpec(**dargs)

    afe = frontend_wrappers.RetryingAFE(timeout_min=30, delay_sec=10,
                                        user=suite_spec.job.user, debug=False)
    tko = frontend_wrappers.RetryingTKO(timeout_min=30, delay_sec=10,
                                        user=suite_spec.job.user, debug=False)

    try:
        my_job_id = int(tko_utils.get_afe_job_id(suite_spec.job.tag))
        logging.debug('Determined own job id: %d', my_job_id)
    except (ValueError, TypeError):
        # ValueError: the tag yielded a non-numeric string.  TypeError guards
        # against a non-string result such as None.  Either way the suite can
        # still run, just without a parent job id.
        my_job_id = None
        logging.warning('Could not determine own job id.')

    _perform_reimage_and_run(suite_spec, afe, tko, suite_job_id=my_job_id)

    logging.debug('Returning from dynamic_suite.reimage_and_run.')


def _perform_reimage_and_run(spec, afe, tko, suite_job_id=None):
    """
    Do the work of reimaging hosts and running tests.

    @param spec: a populated SuiteSpec object.
    @param afe: an instance of AFE as defined in server/frontend.py.
    @param tko: an instance of TKO as defined in server/frontend.py.
    @param suite_job_id: Job id that will act as parent id to all sub jobs.
                         Default: None
    """
    # We can't do anything else until the devserver has finished downloading
    # control_files and test_suites packages so that we can get the control
    # files we should schedule.
    if not spec.run_prod_code:
        _stage_artifacts(spec)

    # Record when artifact staging finished in the suite job's keyval file.
    timestamp = datetime.datetime.now().strftime(time_utils.TIME_FMT)
    utils.write_keyval(
        spec.job.resultdir,
        {constants.ARTIFACT_FINISHED_TIME: timestamp})

    suite = Suite.create_from_predicates(
        predicates=[spec.predicate],
        name=spec.name,
        builds=spec.builds,
        board=spec.board,
        devserver=spec.devserver,
        afe=afe,
        tko=tko,
        pool=spec.pool,
        results_dir=spec.job.resultdir,
        max_runtime_mins=spec.max_runtime_mins,
        timeout_mins=spec.timeout_mins,
        file_bugs=spec.file_bugs,
        file_experimental_bugs=spec.file_experimental_bugs,
        suite_job_id=suite_job_id,
        extra_deps=spec.suite_dependencies,
        priority=spec.priority,
        wait_for_results=spec.wait_for_results,
        job_retry=spec.job_retry,
        max_retries=spec.max_retries,
        offload_failures_only=spec.offload_failures_only,
        test_source_build=spec.test_source_build,
        run_prod_code=spec.run_prod_code,
        job_keyvals=spec.job_keyvals,
        test_args=spec.test_args)

    if spec.delay_minutes:
        logging.debug('delay_minutes is set. Sleeping %d minutes before '
                      'creating test jobs.', spec.delay_minutes)
        time.sleep(spec.delay_minutes*60)
        logging.debug('Finished waiting for %d minutes before creating test '
                      'jobs.', spec.delay_minutes)

    # Now we get to asynchronously schedule tests.
    suite.schedule(spec.job.record_entry, spec.add_experimental)

    if suite.wait_for_results:
        logging.debug('Waiting on suite.')
        suite.wait(spec.job.record_entry, spec.bug_template)
        logging.debug('Finished waiting on suite. '
                      'Returning from _perform_reimage_and_run.')
    else:
        logging.info('wait_for_results is set to False, suite job will exit '
                     'without waiting for test jobs to finish.')


def _stage_artifacts(suite_spec):
    """Stage artifacts for a suite job.

    Asks the spec's devserver to stage the 'control_files' and 'test_suites'
    artifacts of the test source build.

    @param suite_spec: a populated SuiteSpec object.

    @raises error.AsynchronousBuildFailure: if the devserver raises a
            DevServerException while staging.
    """
    try:
        suite_spec.devserver.stage_artifacts(
                image=suite_spec.test_source_build,
                artifacts=['control_files', 'test_suites'])
    except dev_server.DevServerException as e:
        # If we can't get the control files, there's nothing to run.
        raise error.AsynchronousBuildFailure(e)