#!/usr/bin/python
#
# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Tool to validate code in prod branch before pushing to lab.

The script runs push_to_prod suite to verify code in prod branch is ready to be
pushed. Link to design document:
https://docs.google.com/a/google.com/document/d/1JMz0xS3fZRSHMpFkkKAL_rxsdbNZomhHbC3B8L71uuI/edit

To verify if prod branch can be pushed to lab, run following command in
chromeos-autotest.cbf server:
/usr/local/autotest/site_utils/test_push.py -e someone@company.com

The script uses latest stumpy canary build as test build by default.

"""

import argparse
import getpass
import multiprocessing
import os
import re
import subprocess
import sys
import time
import traceback
import urllib2

import common
try:
    from autotest_lib.frontend import setup_django_environment
    from autotest_lib.frontend.afe import models
except ImportError:
    # Unittest may not have Django database configured and will fail to import.
    pass
from autotest_lib.client.common_lib import global_config
from autotest_lib.server import site_utils
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.cros.dynamic_suite import reporting
from autotest_lib.server.hosts import factory
from autotest_lib.site_utils import gmail_lib
from autotest_lib.site_utils.suite_scheduler import constants

CONFIG = global_config.global_config

AFE = frontend_wrappers.RetryingAFE(timeout_min=0.5, delay_sec=2)

MAIL_FROM = 'chromeos-test@google.com'
DEVSERVERS = CONFIG.get_config_value('CROS', 'dev_server', type=list,
                                     default=[])
BUILD_REGEX = r'^R[\d]+-[\d]+\.[\d]+\.[\d]+$'
RUN_SUITE_COMMAND = 'run_suite.py'
PUSH_TO_PROD_SUITE = 'push_to_prod'
DUMMY_SUITE = 'dummy'
AU_SUITE = 'paygen_au_canary'

SUITE_JOB_START_INFO_REGEX = (r'^.*Created suite job:.*'
                              r'tab_id=view_job&object_id=(\d+)$')

# Dictionary of test results keyed by test name regular expression.
EXPECTED_TEST_RESULTS = {'^SERVER_JOB$': 'GOOD',
                         # This is related to dummy_Fail/control.dependency.
                         'dummy_Fail.dependency$': 'TEST_NA',
                         'login_LoginSuccess.*': 'GOOD',
                         'platform_InstallTestImage_SERVER_JOB$': 'GOOD',
                         'provision_AutoUpdate.double': 'GOOD',
                         'dummy_Pass.*': 'GOOD',
                         'dummy_Fail.Fail$': 'FAIL',
                         'dummy_Fail.RetryFail$': 'FAIL',
                         'dummy_Fail.RetrySuccess': 'GOOD',
                         'dummy_Fail.Error$': 'ERROR',
                         'dummy_Fail.Warn$': 'WARN',
                         'dummy_Fail.NAError$': 'TEST_NA',
                         'dummy_Fail.Crash$': 'GOOD',
                         }

EXPECTED_TEST_RESULTS_DUMMY = {'^SERVER_JOB$': 'GOOD',
                               'dummy_Pass.*': 'GOOD',
                               'dummy_Fail.Fail': 'FAIL',
                               'dummy_Fail.Warn': 'WARN',
                               'dummy_Fail.Crash': 'GOOD',
                               'dummy_Fail.Error': 'ERROR',
                               'dummy_Fail.NAError': 'TEST_NA',}

EXPECTED_TEST_RESULTS_AU = {
        'SERVER_JOB$': 'GOOD',
        'autoupdate_EndToEndTest.paygen_au_canary_delta.*': 'GOOD',
        'autoupdate_EndToEndTest.paygen_au_canary_full.*': 'GOOD',
        }

# Anchor for the auto-filed bug for dummy_Fail tests.
BUG_ANCHOR = 'TestFailure(push_to_prod,dummy_Fail.Fail,always fail)'

URL_HOST = CONFIG.get_config_value('SERVER', 'hostname', type=str)
URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str)

# Some test could be missing from the test results for various reasons. Add
# such test in this list and explain the reason.
# Entries are compared verbatim against the keys of the expected-results
# dictionaries, so each entry must be spelled exactly like the key it excuses.
IGNORE_MISSING_TESTS = [
    # For latest build, npo_test_delta does not exist.
    'autoupdate_EndToEndTest.npo_test_delta.*',
    # For trybot build, nmo_test_delta does not exist.
    'autoupdate_EndToEndTest.nmo_test_delta.*',
    # Older build does not have login_LoginSuccess test in push_to_prod suite.
    # TODO(dshi): Remove following lines after R41 is stable.
    'login_LoginSuccess.*']

# Save all run_suite command output.
# NOTE(review): do_run_suite() appends to this list from inside the child
# processes started by main(), so the copy held by the parent process (the one
# used for the failure email) appears to stay empty -- confirm and, if so,
# ship the output back through the queue.
run_suite_output = []


class TestPushException(Exception):
    """Exception to be raised when the test to push to prod failed."""
    pass


def powerwash_dut(hostname):
    """Powerwash the dut with the given hostname.

    Writes the "fast safe" reset marker to the stateful partition and reboots
    the dut.

    @param hostname: hostname of the dut.
    """
    host = factory.create_host(hostname)
    try:
        host.run('echo "fast safe" > '
                 '/mnt/stateful_partition/factory_install_reset')
        host.run('reboot')
    finally:
        # Always release the host object, even if one of the commands failed.
        host.close()


def get_default_build(devserver=None, board='stumpy'):
    """Get the default build to be used for test.

    @param devserver: devserver used to look for latest staged build. If value
                      is None, all devservers in config will be tried.
    @param board: Name of board to be tested, default is stumpy.
    @return: Build to be tested, e.g., stumpy-release/R36-5881.0.0
    """
    LATEST_BUILD_URL_PATTERN = '%s/latestbuild?target=%s-release'
    # Honor an explicitly requested devserver; otherwise try every devserver
    # listed in the config, in order.
    servers = [devserver] if devserver else DEVSERVERS
    for server in servers:
        url = LATEST_BUILD_URL_PATTERN % (server, board)
        try:
            build = urllib2.urlopen(url).read().strip()
        except urllib2.URLError as e:
            # One unreachable devserver should not prevent trying the others.
            print('Failed to query latest build from %s: %s' % (url, e))
            continue
        if build and re.match(BUILD_REGEX, build):
            return '%s-release/%s' % (board, build)

    # If no devserver has any build staged for the given board, use the stable
    # build in config.
    build = CONFIG.get_config_value('CROS', 'stable_cros_version')
    return '%s-release/%s' % (board, build)


def parse_arguments():
    """Parse arguments for test_push tool.

    When no build (or shard build) is given on the command line, the default
    build for the corresponding board is looked up via get_default_build.

    @return: Parsed arguments.

    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--board', dest='board', default='stumpy',
                        help='Default is stumpy.')
    parser.add_argument('-sb', '--shard_board', dest='shard_board',
                        default='quawks',
                        help='Default is quawks.')
    parser.add_argument('-i', '--build', dest='build', default=None,
                        help='Default is the latest canary build of given '
                             'board. Must be a canary build, otherwise AU test '
                             'will fail.')
    parser.add_argument('-si', '--shard_build', dest='shard_build',
                        default=None,
                        help='Default is the latest canary build of given '
                             'board. Must be a canary build, otherwise AU test '
                             'will fail.')
    parser.add_argument('-p', '--pool', dest='pool', default='bvt')
    parser.add_argument('-u', '--num', dest='num', type=int, default=3,
                        help='Run on at most NUM machines.')
    # Kept as a string on purpose: the value is forwarded verbatim to
    # run_suite.py on its command line.
    parser.add_argument('-f', '--file_bugs', dest='file_bugs', default='True',
                        help='File bugs on test failures. Must pass "True" or '
                             '"False" if used.')
    parser.add_argument('-e', '--email', dest='email', default=None,
                        help='Email address for the notification to be sent to '
                             'after the script finished running.')
    parser.add_argument('-d', '--devserver', dest='devserver',
                        default=None,
                        help='devserver to find what\'s the latest build.')
    parser.add_argument('-t', '--timeout_min', dest='timeout_min', type=int,
                        default=24,
                        help='Time in mins to wait before abort the jobs we '
                             'are waiting on. Only for the asynchronous suites '
                             'triggered by create_and_return flag.')

    arguments = parser.parse_args(sys.argv[1:])

    # Get latest canary build as default build.
    if not arguments.build:
        arguments.build = get_default_build(arguments.devserver,
                                            arguments.board)
    if not arguments.shard_build:
        arguments.shard_build = get_default_build(arguments.devserver,
                                                  arguments.shard_board)

    return arguments


def do_run_suite(suite_name, arguments, use_shard=False,
                 create_and_return=False):
    """Call run_suite to run a suite job, and return the suite job id.

    The script waits the suite job to finish before returning the suite job id.
    Also it will echo the run_suite output to stdout.

    @param suite_name: Name of a suite, e.g., dummy.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.

    @return: Suite job ID.
    @raise TestPushException: If a dut fails to powerwash, the suite job ID
            cannot be found in the run_suite output, or an asynchronous suite
            does not finish within arguments.timeout_min minutes.

    """
    if not use_shard:
        board = arguments.board
        build = arguments.build
    else:
        board = arguments.shard_board
        build = arguments.shard_build

    # Remove cros-version label to force provision.
    hosts = AFE.get_hosts(label=constants.Labels.BOARD_PREFIX+board)
    for host in hosts:
        version_labels = [l for l in host.labels
                          if l.startswith(provision.CROS_VERSION_PREFIX)]
        for label in version_labels:
            AFE.run('host_remove_labels', id=host.id, labels=[label])

        if use_shard and not create_and_return:
            # Let's verify the repair flow and powerwash the duts.  We can
            # assume they're all cros hosts (valid assumption?) so powerwash
            # will work.
            try:
                powerwash_dut(host.hostname)
            except Exception as e:
                raise TestPushException('Failed to powerwash dut %s. Make '
                                        'sure the dut is working first. '
                                        'Error: %s' % (host.hostname, e))
            AFE.reverify_hosts(hostnames=[host.hostname])

    current_dir = os.path.dirname(os.path.realpath(__file__))
    cmd = [os.path.join(current_dir, RUN_SUITE_COMMAND),
           '-s', suite_name,
           '-b', board,
           '-i', build,
           '-p', arguments.pool,
           '-u', str(arguments.num),
           '-f', arguments.file_bugs]
    if create_and_return:
        cmd += ['-c']

    suite_job_id = None

    # stderr is folded into stdout so that everything run_suite prints is
    # echoed and recorded in order.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

    while True:
        line = proc.stdout.readline()

        # Break when run_suite process completed.
        if not line and proc.poll() is not None:
            break
        print(line.rstrip())
        run_suite_output.append(line.rstrip())

        # Only the first line announcing the suite job is of interest.
        if not suite_job_id:
            m = re.match(SUITE_JOB_START_INFO_REGEX, line)
            if m and m.group(1):
                suite_job_id = int(m.group(1))

    if not suite_job_id:
        raise TestPushException('Failed to retrieve suite job ID.')

    # If create_and_return specified, run_suite returned right after creating
    # the suite, so poll here until the suite job is finished or times out.
    if create_and_return:
        end = time.time() + arguments.timeout_min * 60
        while not AFE.get_jobs(id=suite_job_id, finished=True):
            if time.time() < end:
                time.sleep(10)
            else:
                AFE.run('abort_host_queue_entries', job=suite_job_id)
                raise TestPushException(
                        'Asynchronous suite triggered by create_and_return '
                        'flag has timed out after %d mins. Aborting it.' %
                        arguments.timeout_min)

    print('Suite job %s is completed.' % suite_job_id)
    return suite_job_id


def check_dut_image(build, suite_job_id):
    """Confirm all DUTs used for the suite are imaged to expected build.

    @param build: Expected build to be imaged.
    @param suite_job_id: job ID of the suite job.
    @raise TestPushException: If a DUT does not have expected build imaged.
    """
    print('Checking image installed in DUTs...')
    # Child jobs of the suite, then the first host queue entry of each child.
    job_ids = [job.id for job in
               models.Job.objects.filter(parent_job_id=suite_job_id)]
    hqes = [models.HostQueueEntry.objects.filter(job_id=job_id)[0]
            for job_id in job_ids]
    hostnames = set(hqe.host.hostname for hqe in hqes)
    for hostname in hostnames:
        found_build = site_utils.get_build_from_afe(hostname, AFE)
        if found_build != build:
            raise TestPushException('DUT is not imaged properly. Host %s has '
                                    'build %s, while build %s is expected.' %
                                    (hostname, found_build, build))


def _compare_test_results(test_views, expected_results):
    """Compare actual test results of a suite with the expected results.

    Each test name and status is echoed to stdout while comparing.

    @param test_views: A dictionary of test name to actual test status.
    @param expected_results: A dictionary of test name regular expression to
                             expected test status.
    @return: A tuple (mismatch_errors, extra_test_errors, missing_tests):
             messages for tests whose status differs from the expectation,
             names of tests matching no expected pattern, and the sorted
             expected patterns that matched no test and are not listed in
             IGNORE_MISSING_TESTS.
    """
    mismatch_errors = []
    extra_test_errors = []
    found_keys = set()

    for test_name, test_status in test_views.items():
        print('%s%s' % (test_name.ljust(30), test_status))
        test_found = False
        for key, val in expected_results.items():
            if not re.search(key, test_name):
                continue
            test_found = True
            found_keys.add(key)
            # TODO(dshi): result for this test is ignored until servo is
            # added to a host accessible by cbf server (crbug.com/277109).
            if key == 'platform_InstallTestImage_SERVER_JOB$':
                continue
            if val != test_status:
                mismatch_errors.append('%s Expected: [%s], Actual: [%s]' %
                                       (test_name, val, test_status))
        if not test_found:
            extra_test_errors.append(test_name)

    missing_tests = (set(expected_results) - found_keys -
                     set(IGNORE_MISSING_TESTS))
    return mismatch_errors, extra_test_errors, sorted(missing_tests)


def test_suite(suite_name, expected_results, arguments, use_shard=False,
               create_and_return=False):
    """Call run_suite to start a suite job and verify results.

    @param suite_name: Name of a suite, e.g., dummy
    @param expected_results: A dictionary of test name to test result.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.
    @raise TestPushException: If the results do not match expected_results,
            or the link to the suite job log can not be loaded.
    """
    suite_job_id = do_run_suite(suite_name, arguments, use_shard,
                                create_and_return)

    # Confirm all DUTs used for the suite are imaged to expected build.
    # hqe.host_id for jobs running in shard is not synced back to master db,
    # therefore, skip verifying dut build for jobs running in shard.
    if suite_name != AU_SUITE and not use_shard:
        check_dut_image(arguments.build, suite_job_id)

    # Find all tests and their status
    print('Comparing test results...')
    TKO = frontend_wrappers.RetryingTKO(timeout_min=0.1, delay_sec=10)
    test_views = site_utils.get_test_views_from_tko(suite_job_id, TKO)

    mismatch_errors, extra_test_errors, missing_test_errors = (
            _compare_test_results(test_views, expected_results))

    summary = []
    if mismatch_errors:
        summary.append(('Results of %d test(s) do not match expected '
                        'values:') % len(mismatch_errors))
        summary.extend(mismatch_errors)
        summary.append('\n')

    if extra_test_errors:
        summary.append('%d test(s) are not expected to be run:' %
                       len(extra_test_errors))
        summary.extend(extra_test_errors)
        summary.append('\n')

    if missing_test_errors:
        summary.append('%d test(s) are missing from the results:' %
                       len(missing_test_errors))
        summary.extend(missing_test_errors)
        summary.append('\n')

    # Test link to log can be loaded.
    job_name = '%s-%s' % (suite_job_id, getpass.getuser())
    log_link = URL_PATTERN % (URL_HOST, job_name)
    try:
        urllib2.urlopen(log_link).read()
    except urllib2.URLError:
        summary.append('Failed to load page for link to log: %s.' % log_link)

    if summary:
        raise TestPushException('\n'.join(summary))


def test_suite_wrapper(queue, suite_name, expected_results, arguments,
                       use_shard=False, create_and_return=False):
    """Wrapper to call test_suite. Handle exception and pipe it to parent
    process.

    @param queue: Queue to save exception to be accessed by parent process.
    @param suite_name: Name of a suite, e.g., dummy
    @param expected_results: A dictionary of test name to test result.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.
    """
    try:
        test_suite(suite_name, expected_results, arguments, use_shard,
                   create_and_return)
    except Exception:
        # Store the whole exc_info leads to a PicklingError, so only the
        # extracted (plain tuple) form of the traceback is sent.
        except_type, except_value, tb = sys.exc_info()
        queue.put((except_type, except_value, traceback.extract_tb(tb)))


def close_bug():
    """Close all existing bugs filed for dummy_Fail.

    @return: A list of issue ids to be used in check_bug_filed_and_deduped.
    @raise TestPushException: If an issue is found again after this function
            already tried to close it.
    """
    old_issue_ids = []
    reporter = reporting.Reporter()
    while True:
        issue = reporter.find_issue_by_marker(BUG_ANCHOR)
        if not issue:
            return old_issue_ids
        # Seeing the same issue twice means the previous close had no effect.
        if issue.id in old_issue_ids:
            raise TestPushException('Failed to close issue %d' % issue.id)
        old_issue_ids.append(issue.id)
        reporter.modify_bug_report(issue.id,
                                   comment='Issue closed by test_push script.',
                                   label_update='',
                                   status='WontFix')


def check_bug_filed_and_deduped(old_issue_ids):
    """Confirm bug related to dummy_Fail was filed and deduped.

    @param old_issue_ids: A list of issue ids that was closed earlier. id of the
        new issue must be not in this list.
    @raise TestPushException: If auto bug file failed to create a new issue or
        dedupe multiple failures.
    """
    reporter = reporting.Reporter()
    issue = reporter.find_issue_by_marker(BUG_ANCHOR)
    if not issue:
        raise TestPushException('Auto bug file failed. Unable to locate bug '
                                'with marker %s' % BUG_ANCHOR)
    if old_issue_ids and issue.id in old_issue_ids:
        raise TestPushException('Auto bug file failed to create a new issue. '
                                'id of the old issue found is %d.' % issue.id)
    # The issue must carry the autofiled-count label with a count of 2.
    if ('%s2' % reporter.AUTOFILED_COUNT) not in issue.labels:
        raise TestPushException(('Auto bug file failed to dedupe for issue %d '
                                 'with labels of %s.') %
                                (issue.id, issue.labels))
    # Close the bug, and do the search again, which should return None.
    reporter.modify_bug_report(issue.id,
                               comment='Issue closed by test_push script.',
                               label_update='',
                               status='WontFix')
    second_issue = reporter.find_issue_by_marker(BUG_ANCHOR)
    if second_issue:
        ids = '%d, %d' % (issue.id, second_issue.id)
        raise TestPushException(('Auto bug file failed. Multiple issues (%s) '
                                 'filed with marker %s') % (ids, BUG_ANCHOR))
    print('Issue %d was filed and deduped successfully.' % issue.id)


def check_queue(queue):
    """Check the queue for any exception being raised.

    @param queue: Queue used to store exception for parent process to access.
    @raise: Any exception found in the queue.
    """
    if queue.empty():
        return
    except_type, except_value, extracted_tb = queue.get()
    # The original traceback object can not cross the process boundary, so
    # print its extracted form and raise a new exception of the same type.
    print('Original stack trace of the exception:\n%s' %
          ''.join(traceback.format_list(extracted_tb)))
    raise except_type(except_value)


def main():
    """Entry point for test_push script.

    Runs the push_to_prod, AU, shard and asynchronous suites in parallel
    processes, verifies bug filing once the push_to_prod suite has finished,
    and optionally emails the outcome. Any failure is re-raised after the
    notification is sent.
    """
    arguments = parse_arguments()

    try:
        # Close existing bugs. New bug should be filed in dummy_Fail test.
        old_issue_ids = close_bug()

        queue = multiprocessing.Queue()

        push_to_prod_suite = multiprocessing.Process(
                target=test_suite_wrapper,
                args=(queue, PUSH_TO_PROD_SUITE, EXPECTED_TEST_RESULTS,
                      arguments))
        push_to_prod_suite.start()

        # TODO(dshi): Remove following line after crbug.com/267644 is fixed.
        # Also, merge EXPECTED_TEST_RESULTS_AU to EXPECTED_TEST_RESULTS
        au_suite = multiprocessing.Process(
                target=test_suite_wrapper,
                args=(queue, AU_SUITE, EXPECTED_TEST_RESULTS_AU,
                      arguments))
        au_suite.start()

        shard_suite = multiprocessing.Process(
                target=test_suite_wrapper,
                args=(queue, DUMMY_SUITE, EXPECTED_TEST_RESULTS_DUMMY,
                      arguments, True))
        shard_suite.start()

        # suite test with --create_and_return flag
        asynchronous_suite = multiprocessing.Process(
                target=test_suite_wrapper,
                args=(queue, DUMMY_SUITE, EXPECTED_TEST_RESULTS_DUMMY,
                      arguments, True, True))
        asynchronous_suite.start()

        bug_filing_checked = False
        while (push_to_prod_suite.is_alive() or au_suite.is_alive() or
               shard_suite.is_alive() or asynchronous_suite.is_alive()):
            check_queue(queue)
            # Check bug filing results to fail early if bug filing failed.
            if not bug_filing_checked and not push_to_prod_suite.is_alive():
                check_bug_filed_and_deduped(old_issue_ids)
                bug_filing_checked = True
            time.sleep(5)

        check_queue(queue)

        # If push_to_prod was the last suite to finish, the loop above exited
        # before bug filing could be verified; do it now.
        if not bug_filing_checked:
            check_bug_filed_and_deduped(old_issue_ids)
            bug_filing_checked = True

        push_to_prod_suite.join()
        au_suite.join()
        shard_suite.join()
        asynchronous_suite.join()
    except Exception as e:
        print('Test for pushing to prod failed:\n')
        print(str(e))
        # Send out email about the test failure.
        if arguments.email:
            gmail_lib.send_email(
                    arguments.email,
                    'Test for pushing to prod failed. Do NOT push!',
                    ('Errors occurred during the test:\n\n%s\n\n' % str(e) +
                     'run_suite output:\n\n%s' % '\n'.join(run_suite_output)))
        raise

    message = ('\nAll tests are completed successfully, prod branch is ready to'
               ' be pushed.')
    print(message)
    # Send out email about test completed successfully.
    if arguments.email:
        gmail_lib.send_email(
                arguments.email,
                'Test for pushing to prod completed successfully',
                message)


if __name__ == '__main__':
    sys.exit(main())