Home | History | Annotate | Download | only in site_utils
      1 #!/usr/bin/python
      2 #
      3 # Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
      4 # Use of this source code is governed by a BSD-style license that can be
      5 # found in the LICENSE file.
      6 
      7 """Tool to validate code in prod branch before pushing to lab.
      8 
      9 The script runs push_to_prod suite to verify code in prod branch is ready to be
     10 pushed. Link to design document:
     11 https://docs.google.com/a/google.com/document/d/1JMz0xS3fZRSHMpFkkKAL_rxsdbNZomhHbC3B8L71uuI/edit
     12 
     13 To verify if prod branch can be pushed to lab, run the following command
     14 on the chromeos-autotest.cbf server:
     15 /usr/local/autotest/site_utils/test_push.py -e someone@company.com
     16 
     17 The script uses latest stumpy canary build as test build by default.
     18 
     19 """
     20 
     21 import argparse
     22 import getpass
     23 import multiprocessing
     24 import os
     25 import re
     26 import subprocess
     27 import sys
     28 import time
     29 import traceback
     30 import urllib2
     31 
     32 import common
     33 try:
     34     from autotest_lib.frontend import setup_django_environment
     35     from autotest_lib.frontend.afe import models
     36 except ImportError:
     37     # Unittest may not have Django database configured and will fail to import.
     38     pass
     39 from autotest_lib.client.common_lib import global_config
     40 from autotest_lib.server import site_utils
     41 from autotest_lib.server.cros import provision
     42 from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
     43 from autotest_lib.server.cros.dynamic_suite import reporting
     44 from autotest_lib.server.hosts import factory
     45 from autotest_lib.site_utils import gmail_lib
     46 from autotest_lib.site_utils.suite_scheduler import constants
     47 
     48 CONFIG = global_config.global_config
     49 
     50 AFE = frontend_wrappers.RetryingAFE(timeout_min=0.5, delay_sec=2)
     51 
     52 MAIL_FROM = 'chromeos-test (at] google.com'
     53 DEVSERVERS = CONFIG.get_config_value('CROS', 'dev_server', type=list,
     54                                      default=[])
     55 BUILD_REGEX = '^R[\d]+-[\d]+\.[\d]+\.[\d]+$'
     56 RUN_SUITE_COMMAND = 'run_suite.py'
     57 PUSH_TO_PROD_SUITE = 'push_to_prod'
     58 DUMMY_SUITE = 'dummy'
     59 AU_SUITE = 'paygen_au_canary'
     60 
     61 SUITE_JOB_START_INFO_REGEX = ('^.*Created suite job:.*'
     62                               'tab_id=view_job&object_id=(\d+)$')
     63 
     64 # Dictionary of test results keyed by test name regular expression.
     65 EXPECTED_TEST_RESULTS = {'^SERVER_JOB$':                 'GOOD',
     66                          # This is related to dummy_Fail/control.dependency.
     67                          'dummy_Fail.dependency$':       'TEST_NA',
     68                          'login_LoginSuccess.*':         'GOOD',
     69                          'platform_InstallTestImage_SERVER_JOB$': 'GOOD',
     70                          'provision_AutoUpdate.double':  'GOOD',
     71                          'dummy_Pass.*':                 'GOOD',
     72                          'dummy_Fail.Fail$':             'FAIL',
     73                          'dummy_Fail.RetryFail$':        'FAIL',
     74                          'dummy_Fail.RetrySuccess':      'GOOD',
     75                          'dummy_Fail.Error$':            'ERROR',
     76                          'dummy_Fail.Warn$':             'WARN',
     77                          'dummy_Fail.NAError$':          'TEST_NA',
     78                          'dummy_Fail.Crash$':            'GOOD',
     79                          }
     80 
     81 EXPECTED_TEST_RESULTS_DUMMY = {'^SERVER_JOB$':       'GOOD',
     82                                'dummy_Pass.*':       'GOOD',
     83                                'dummy_Fail.Fail':    'FAIL',
     84                                'dummy_Fail.Warn':    'WARN',
     85                                'dummy_Fail.Crash':   'GOOD',
     86                                'dummy_Fail.Error':   'ERROR',
     87                                'dummy_Fail.NAError': 'TEST_NA',}
     88 
     89 EXPECTED_TEST_RESULTS_AU = {'SERVER_JOB$':                        'GOOD',
     90          'autoupdate_EndToEndTest.paygen_au_canary_delta.*': 'GOOD',
     91          'autoupdate_EndToEndTest.paygen_au_canary_full.*':  'GOOD',
     92          }
     93 
     94 # Anchor for the auto-filed bug for dummy_Fail tests.
     95 BUG_ANCHOR = 'TestFailure(push_to_prod,dummy_Fail.Fail,always fail)'
     96 
# Server hostname and log URL pattern, both read from the global config.
# test_suite fills the pattern with (URL_HOST, job_name) to build the log link
# it then tries to load.
URL_HOST = CONFIG.get_config_value('SERVER', 'hostname', type=str)
URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str)
     99 
    100 # Some test could be missing from the test results for various reasons. Add
    101 # such test in this list and explain the reason.
    102 IGNORE_MISSING_TESTS = [
    103     # For latest build, npo_test_delta does not exist.
    104     'autoupdate_EndToEndTest.npo_test_delta.*',
    105     # For trybot build, nmo_test_delta does not exist.
    106     'autoupdate_EndToEndTest.nmo_test_delta.*',
    107     # Older build does not have login_LoginSuccess test in push_to_prod suite.
    108     # TODO(dshi): Remove following lines after R41 is stable.
    109     'login_LoginSuccess']
    110 
    111 # Save all run_suite command output.
    112 run_suite_output = []
    113 
    114 class TestPushException(Exception):
    115     """Exception to be raised when the test to push to prod failed."""
    116     pass
    117 
    118 
    119 def powerwash_dut(hostname):
    120     """Powerwash the dut with the given hostname.
    121 
    122     @param hostname: hostname of the dut.
    123     """
    124     host = factory.create_host(hostname)
    125     host.run('echo "fast safe" > '
    126              '/mnt/stateful_partition/factory_install_reset')
    127     host.run('reboot')
    128     host.close()
    129 
    130 
    131 def get_default_build(devserver=None, board='stumpy'):
    132     """Get the default build to be used for test.
    133 
    134     @param devserver: devserver used to look for latest staged build. If value
    135                       is None, all devservers in config will be tried.
    136     @param board: Name of board to be tested, default is stumpy.
    137     @return: Build to be tested, e.g., stumpy-release/R36-5881.0.0
    138     """
    139     LATEST_BUILD_URL_PATTERN = '%s/latestbuild?target=%s-release'
    140     build = None
    141     if not devserver:
    142         for server in DEVSERVERS:
    143             url = LATEST_BUILD_URL_PATTERN % (server, board)
    144             build = urllib2.urlopen(url).read()
    145             if build and re.match(BUILD_REGEX, build):
    146                 return '%s-release/%s' % (board, build)
    147 
    148     # If no devserver has any build staged for the given board, use the stable
    149     # build in config.
    150     build = CONFIG.get_config_value('CROS', 'stable_cros_version')
    151     return '%s-release/%s' % (board, build)
    152 
    153 
    154 def parse_arguments():
    155     """Parse arguments for test_push tool.
    156 
    157     @return: Parsed arguments.
    158 
    159     """
    160     parser = argparse.ArgumentParser()
    161     parser.add_argument('-b', '--board', dest='board', default='stumpy',
    162                         help='Default is stumpy.')
    163     parser.add_argument('-sb', '--shard_board', dest='shard_board',
    164                         default='quawks',
    165                         help='Default is quawks.')
    166     parser.add_argument('-i', '--build', dest='build', default=None,
    167                         help='Default is the latest canary build of given '
    168                              'board. Must be a canary build, otherwise AU test '
    169                              'will fail.')
    170     parser.add_argument('-si', '--shard_build', dest='shard_build', default=None,
    171                         help='Default is the latest canary build of given '
    172                              'board. Must be a canary build, otherwise AU test '
    173                              'will fail.')
    174     parser.add_argument('-p', '--pool', dest='pool', default='bvt')
    175     parser.add_argument('-u', '--num', dest='num', type=int, default=3,
    176                         help='Run on at most NUM machines.')
    177     parser.add_argument('-f', '--file_bugs', dest='file_bugs', default='True',
    178                         help='File bugs on test failures. Must pass "True" or '
    179                              '"False" if used.')
    180     parser.add_argument('-e', '--email', dest='email', default=None,
    181                         help='Email address for the notification to be sent to '
    182                              'after the script finished running.')
    183     parser.add_argument('-d', '--devserver', dest='devserver',
    184                         default=None,
    185                         help='devserver to find what\'s the latest build.')
    186     parser.add_argument('-t', '--timeout_min', dest='timeout_min', type=int,
    187                         default=24,
    188                         help='Time in mins to wait before abort the jobs we '
    189                              'are waiting on. Only for the asynchronous suites '
    190                              'triggered by create_and_return flag.')
    191 
    192     arguments = parser.parse_args(sys.argv[1:])
    193 
    194     # Get latest canary build as default build.
    195     if not arguments.build:
    196         arguments.build = get_default_build(arguments.devserver,
    197                                             arguments.board)
    198     if not arguments.shard_build:
    199         arguments.shard_build = get_default_build(arguments.devserver,
    200                                                   arguments.shard_board)
    201 
    202     return arguments
    203 
    204 
    205 def do_run_suite(suite_name, arguments, use_shard=False,
    206                  create_and_return=False):
    207     """Call run_suite to run a suite job, and return the suite job id.
    208 
    209     The script waits the suite job to finish before returning the suite job id.
    210     Also it will echo the run_suite output to stdout.
    211 
    212     @param suite_name: Name of a suite, e.g., dummy.
    213     @param arguments: Arguments for run_suite command.
    214     @param use_shard: If true, suite is scheduled for shard board.
    215     @param create_and_return: If True, run_suite just creates the suite, print
    216                               the job id, then finish immediately.
    217 
    218     @return: Suite job ID.
    219 
    220     """
    221     if not use_shard:
    222         board = arguments.board
    223         build = arguments.build
    224     else:
    225         board = arguments.shard_board
    226         build = arguments.shard_build
    227 
    228     # Remove cros-version label to force provision.
    229     hosts = AFE.get_hosts(label=constants.Labels.BOARD_PREFIX+board)
    230     for host in hosts:
    231         for label in [l for l in host.labels
    232                       if l.startswith(provision.CROS_VERSION_PREFIX)]:
    233             AFE.run('host_remove_labels', id=host.id, labels=[label])
    234 
    235         if use_shard and not create_and_return:
    236             # Let's verify the repair flow and powerwash the duts.  We can
    237             # assume they're all cros hosts (valid assumption?) so powerwash
    238             # will work.
    239             try:
    240                 powerwash_dut(host.hostname)
    241             except Exception as e:
    242                 raise TestPushException('Failed to powerwash dut %s. Make '
    243                                         'sure the dut is working first. '
    244                                         'Error: %s' % (host.hostname, e))
    245             AFE.reverify_hosts(hostnames=[host.hostname])
    246 
    247     current_dir = os.path.dirname(os.path.realpath(__file__))
    248     cmd = [os.path.join(current_dir, RUN_SUITE_COMMAND),
    249            '-s', suite_name,
    250            '-b', board,
    251            '-i', build,
    252            '-p', arguments.pool,
    253            '-u', str(arguments.num),
    254            '-f', arguments.file_bugs]
    255     if create_and_return:
    256         cmd += ['-c']
    257 
    258     suite_job_id = None
    259 
    260     proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
    261                             stderr=subprocess.STDOUT)
    262 
    263     while True:
    264         line = proc.stdout.readline()
    265 
    266         # Break when run_suite process completed.
    267         if not line and proc.poll() != None:
    268             break
    269         print line.rstrip()
    270         run_suite_output.append(line.rstrip())
    271 
    272         if not suite_job_id:
    273             m = re.match(SUITE_JOB_START_INFO_REGEX, line)
    274             if m and m.group(1):
    275                 suite_job_id = int(m.group(1))
    276 
    277     if not suite_job_id:
    278         raise TestPushException('Failed to retrieve suite job ID.')
    279 
    280     # If create_and_return specified, wait for the suite to finish.
    281     if create_and_return:
    282         end = time.time() + arguments.timeout_min * 60
    283         while not AFE.get_jobs(id=suite_job_id, finished=True):
    284             if time.time() < end:
    285                 time.sleep(10)
    286             else:
    287                 AFE.run('abort_host_queue_entries', job=suite_job_id)
    288                 raise TestPushException(
    289                         'Asynchronous suite triggered by create_and_return '
    290                         'flag has timed out after %d mins. Aborting it.' %
    291                         arguments.timeout_min)
    292 
    293     print 'Suite job %s is completed.' % suite_job_id
    294     return suite_job_id
    295 
    296 
    297 def check_dut_image(build, suite_job_id):
    298     """Confirm all DUTs used for the suite are imaged to expected build.
    299 
    300     @param build: Expected build to be imaged.
    301     @param suite_job_id: job ID of the suite job.
    302     @raise TestPushException: If a DUT does not have expected build imaged.
    303     """
    304     print 'Checking image installed in DUTs...'
    305     job_ids = [job.id for job in
    306                models.Job.objects.filter(parent_job_id=suite_job_id)]
    307     hqes = [models.HostQueueEntry.objects.filter(job_id=job_id)[0]
    308             for job_id in job_ids]
    309     hostnames = set([hqe.host.hostname for hqe in hqes])
    310     for hostname in hostnames:
    311         found_build = site_utils.get_build_from_afe(hostname, AFE)
    312         if found_build != build:
    313             raise TestPushException('DUT is not imaged properly. Host %s has '
    314                                     'build %s, while build %s is expected.' %
    315                                     (hostname, found_build, build))
    316 
    317 
    318 def test_suite(suite_name, expected_results, arguments, use_shard=False,
    319                create_and_return=False):
    320     """Call run_suite to start a suite job and verify results.
    321 
    322     @param suite_name: Name of a suite, e.g., dummy
    323     @param expected_results: A dictionary of test name to test result.
    324     @param arguments: Arguments for run_suite command.
    325     @param use_shard: If true, suite is scheduled for shard board.
    326     @param create_and_return: If True, run_suite just creates the suite, print
    327                               the job id, then finish immediately.
    328     """
    329     suite_job_id = do_run_suite(suite_name, arguments, use_shard,
    330                                 create_and_return)
    331 
    332     # Confirm all DUTs used for the suite are imaged to expected build.
    333     # hqe.host_id for jobs running in shard is not synced back to master db,
    334     # therefore, skip verifying dut build for jobs running in shard.
    335     if suite_name != AU_SUITE and not use_shard:
    336         check_dut_image(arguments.build, suite_job_id)
    337 
    338     # Find all tests and their status
    339     print 'Comparing test results...'
    340     TKO = frontend_wrappers.RetryingTKO(timeout_min=0.1, delay_sec=10)
    341     test_views = site_utils.get_test_views_from_tko(suite_job_id, TKO)
    342 
    343     mismatch_errors = []
    344     extra_test_errors = []
    345 
    346     found_keys = set()
    347     for test_name,test_status in test_views.items():
    348         print "%s%s" % (test_name.ljust(30), test_status)
    349         test_found = False
    350         for key,val in expected_results.items():
    351             if re.search(key, test_name):
    352                 test_found = True
    353                 found_keys.add(key)
    354                 # TODO(dshi): result for this test is ignored until servo is
    355                 # added to a host accessible by cbf server (crbug.com/277109).
    356                 if key == 'platform_InstallTestImage_SERVER_JOB$':
    357                     continue
    358                 if val != test_status:
    359                     error = ('%s Expected: [%s], Actual: [%s]' %
    360                              (test_name, val, test_status))
    361                     mismatch_errors.append(error)
    362         if not test_found:
    363             extra_test_errors.append(test_name)
    364 
    365     missing_test_errors = set(expected_results.keys()) - found_keys
    366     for exception in IGNORE_MISSING_TESTS:
    367         try:
    368             missing_test_errors.remove(exception)
    369         except KeyError:
    370             pass
    371 
    372     summary = []
    373     if mismatch_errors:
    374         summary.append(('Results of %d test(s) do not match expected '
    375                         'values:') % len(mismatch_errors))
    376         summary.extend(mismatch_errors)
    377         summary.append('\n')
    378 
    379     if extra_test_errors:
    380         summary.append('%d test(s) are not expected to be run:' %
    381                        len(extra_test_errors))
    382         summary.extend(extra_test_errors)
    383         summary.append('\n')
    384 
    385     if missing_test_errors:
    386         summary.append('%d test(s) are missing from the results:' %
    387                        len(missing_test_errors))
    388         summary.extend(missing_test_errors)
    389         summary.append('\n')
    390 
    391     # Test link to log can be loaded.
    392     job_name = '%s-%s' % (suite_job_id, getpass.getuser())
    393     log_link = URL_PATTERN % (URL_HOST, job_name)
    394     try:
    395         urllib2.urlopen(log_link).read()
    396     except urllib2.URLError:
    397         summary.append('Failed to load page for link to log: %s.' % log_link)
    398 
    399     if summary:
    400         raise TestPushException('\n'.join(summary))
    401 
    402 
    403 def test_suite_wrapper(queue, suite_name, expected_results, arguments,
    404                        use_shard=False, create_and_return=False):
    405     """Wrapper to call test_suite. Handle exception and pipe it to parent
    406     process.
    407 
    408     @param queue: Queue to save exception to be accessed by parent process.
    409     @param suite_name: Name of a suite, e.g., dummy
    410     @param expected_results: A dictionary of test name to test result.
    411     @param arguments: Arguments for run_suite command.
    412     @param use_shard: If true, suite is scheduled for shard board.
    413     @param create_and_return: If True, run_suite just creates the suite, print
    414                               the job id, then finish immediately.
    415     """
    416     try:
    417         test_suite(suite_name, expected_results, arguments, use_shard,
    418                    create_and_return)
    419     except:
    420         # Store the whole exc_info leads to a PicklingError.
    421         except_type, except_value, tb = sys.exc_info()
    422         queue.put((except_type, except_value, traceback.extract_tb(tb)))
    423 
    424 
    425 def close_bug():
    426     """Close all existing bugs filed for dummy_Fail.
    427 
    428     @return: A list of issue ids to be used in check_bug_filed_and_deduped.
    429     """
    430     old_issue_ids = []
    431     reporter = reporting.Reporter()
    432     while True:
    433         issue = reporter.find_issue_by_marker(BUG_ANCHOR)
    434         if not issue:
    435             return old_issue_ids
    436         if issue.id in old_issue_ids:
    437             raise TestPushException('Failed to close issue %d' % issue.id)
    438         old_issue_ids.append(issue.id)
    439         reporter.modify_bug_report(issue.id,
    440                                    comment='Issue closed by test_push script.',
    441                                    label_update='',
    442                                    status='WontFix')
    443 
    444 
    445 def check_bug_filed_and_deduped(old_issue_ids):
    446     """Confirm bug related to dummy_Fail was filed and deduped.
    447 
    448     @param old_issue_ids: A list of issue ids that was closed earlier. id of the
    449         new issue must be not in this list.
    450     @raise TestPushException: If auto bug file failed to create a new issue or
    451         dedupe multiple failures.
    452     """
    453     reporter = reporting.Reporter()
    454     issue = reporter.find_issue_by_marker(BUG_ANCHOR)
    455     if not issue:
    456         raise TestPushException('Auto bug file failed. Unable to locate bug '
    457                                 'with marker %s' % BUG_ANCHOR)
    458     if old_issue_ids and issue.id in old_issue_ids:
    459         raise TestPushException('Auto bug file failed to create a new issue. '
    460                                 'id of the old issue found is %d.' % issue.id)
    461     if not ('%s2' % reporter.AUTOFILED_COUNT) in issue.labels:
    462         raise TestPushException(('Auto bug file failed to dedupe for issue %d '
    463                                  'with labels of %s.') %
    464                                 (issue.id, issue.labels))
    465     # Close the bug, and do the search again, which should return None.
    466     reporter.modify_bug_report(issue.id,
    467                                comment='Issue closed by test_push script.',
    468                                label_update='',
    469                                status='WontFix')
    470     second_issue = reporter.find_issue_by_marker(BUG_ANCHOR)
    471     if second_issue:
    472         ids = '%d, %d' % (issue.id, second_issue.id)
    473         raise TestPushException(('Auto bug file failed. Multiple issues (%s) '
    474                                  'filed with marker %s') % (ids, BUG_ANCHOR))
    475     print 'Issue %d was filed and deduped successfully.' % issue.id
    476 
    477 
    478 def check_queue(queue):
    479     """Check the queue for any exception being raised.
    480 
    481     @param queue: Queue used to store exception for parent process to access.
    482     @raise: Any exception found in the queue.
    483     """
    484     if queue.empty():
    485         return
    486     exc_info = queue.get()
    487     # Raise the exception with original backtrace.
    488     print 'Original stack trace of the exception:\n%s' % exc_info[2]
    489     raise exc_info[0](exc_info[1])
    490 
    491 
    492 def main():
    493     """Entry point for test_push script."""
    494     arguments = parse_arguments()
    495 
    496     try:
    497         # Close existing bugs. New bug should be filed in dummy_Fail test.
    498         old_issue_ids = close_bug()
    499 
    500         queue = multiprocessing.Queue()
    501 
    502         push_to_prod_suite = multiprocessing.Process(
    503                 target=test_suite_wrapper,
    504                 args=(queue, PUSH_TO_PROD_SUITE, EXPECTED_TEST_RESULTS,
    505                       arguments))
    506         push_to_prod_suite.start()
    507 
    508         # TODO(dshi): Remove following line after crbug.com/267644 is fixed.
    509         # Also, merge EXPECTED_TEST_RESULTS_AU to EXPECTED_TEST_RESULTS
    510         au_suite = multiprocessing.Process(
    511                 target=test_suite_wrapper,
    512                 args=(queue, AU_SUITE, EXPECTED_TEST_RESULTS_AU,
    513                       arguments))
    514         au_suite.start()
    515 
    516         shard_suite = multiprocessing.Process(
    517                 target=test_suite_wrapper,
    518                 args=(queue, DUMMY_SUITE, EXPECTED_TEST_RESULTS_DUMMY,
    519                       arguments, True))
    520         shard_suite.start()
    521 
    522         # suite test with --create_and_return flag
    523         asynchronous_suite = multiprocessing.Process(
    524                 target=test_suite_wrapper,
    525                 args=(queue, DUMMY_SUITE, EXPECTED_TEST_RESULTS_DUMMY,
    526                       arguments, True, True))
    527         asynchronous_suite.start()
    528 
    529         bug_filing_checked = False
    530         while (push_to_prod_suite.is_alive() or au_suite.is_alive() or
    531                shard_suite.is_alive() or asynchronous_suite.is_alive()):
    532             check_queue(queue)
    533             # Check bug filing results to fail early if bug filing failed.
    534             if not bug_filing_checked and not push_to_prod_suite.is_alive():
    535                 check_bug_filed_and_deduped(old_issue_ids)
    536                 bug_filing_checked = True
    537             time.sleep(5)
    538 
    539         check_queue(queue)
    540 
    541         push_to_prod_suite.join()
    542         au_suite.join()
    543         shard_suite.join()
    544         asynchronous_suite.join()
    545     except Exception as e:
    546         print 'Test for pushing to prod failed:\n'
    547         print str(e)
    548         # Send out email about the test failure.
    549         if arguments.email:
    550             gmail_lib.send_email(
    551                     arguments.email,
    552                     'Test for pushing to prod failed. Do NOT push!',
    553                     ('Errors occurred during the test:\n\n%s\n\n' % str(e) +
    554                      'run_suite output:\n\n%s' % '\n'.join(run_suite_output)))
    555         raise
    556 
    557     message = ('\nAll tests are completed successfully, prod branch is ready to'
    558                ' be pushed.')
    559     print message
    560     # Send out email about test completed successfully.
    561     if arguments.email:
    562         gmail_lib.send_email(
    563                 arguments.email,
    564                 'Test for pushing to prod completed successfully',
    565                 message)
    566 
    567 
    568 if __name__ == '__main__':
    569     sys.exit(main())
    570