Home | History | Annotate | Download | only in cros
      1 # Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import collections
      6 import httplib
      7 import logging
      8 import os
      9 import re
     10 import socket
     11 import time
     12 
     13 import common
     14 
     15 from autotest_lib.client.common_lib import error
     16 from autotest_lib.client.common_lib.cros import retry
     17 from autotest_lib.server import utils
     18 
     19 
     20 GOOFY_JSONRPC_SERVER_PORT = 0x0FAC
     21 GOOFY_RUNNING = 'RUNNING'
     22 
     23 
     24 class GoofyProxyException(Exception):
     25     """Exception raised when a goofy rpc fails."""
     26     pass
     27 
     28 
     29 class GoofyRuntimeException(Exception):
     30     """Exception raised when something goes wrong while a test is running."""
     31     pass
     32 
     33 
     34 def retry_goofy_rpc(exception_tuple, timeout_min=30):
     35     """A decorator to use with goofy rpcs.
     36 
     37     This decorator tries to recreate the goofy client proxy on
     38     socket error. It will continue trying to do so until it
     39     executes the method without any socket errors or till the
     40     retry.retry decorator hits it's timeout.
     41 
     42     Usage:
     43         If you just want to recreate the proxy:
     44         1. @retry_goofy_rpc(exception_tuple=(<exception>, socket.error),
     45                             timeout_min=<timeout>)
     46         2. @retry_goofy_rpc(socket.error, timeout_min=<timeout>)
     47             Note: you need to specify the socket.error exception because we
     48             want to retry the call after recreating the proxy.
     49 
     50     @param exception_tuple: A tuple of exceptions to pass to
     51         the retry decorator. Any of these exceptions will result
     52         in retries for the duration of timeout_min.
     53     @param timeout_min: The timeout, in minutes, for which we should
     54         retry the method ignoring any exception in exception_tuple.
     55     """
     56     def inner_decorator(method):
     57         """Inner retry decorator applied to the method.
     58 
     59         @param method: The method that needs to be wrapped in the decorator.
     60 
     61         @return A wrapper function that implements the retry.
     62         """
     63 
     64         @retry.retry(exception_tuple, timeout_min=timeout_min)
     65         def wrapper(*args, **kwargs):
     66             """This wrapper handles socket errors.
     67 
     68             If the method in question:
     69             1. Throws an exception in exception_tuple and it is not a
     70                socket.error, retry for timeout_min through retry.retry.
     71             2. Throws a socket.error, recreate the client proxy, and
     72                retry for timeout_min through retry.retry.
     73             3. Throws an exception not in exception_tuple, fail.
     74             """
     75             try:
     76                 return method(*args, **kwargs)
     77             except socket.error as e:
     78                 goofy_proxy = args[0]
     79                 if type(goofy_proxy) is GoofyProxy:
     80                     logging.warning('Socket error while running factory tests '
     81                                     '%s, recreating goofy proxy.', e)
     82                     goofy_proxy._create_client_proxy(timeout_min=timeout_min)
     83                 else:
     84                     logging.warning('Connectivity was lost and the retry '
     85                                     'decorator was unable to recreate a goofy '
     86                                     'client proxy, args: %s.', args)
     87                 raise
     88 
     89         return wrapper
     90 
     91     return inner_decorator
     92 
     93 
     94 class GoofyProxy(object):
     95     """Client capable of making rpc calls to goofy.
     96 
     97     Methods of this class that can cause goofy to change state
     98     usually need a retry decorator. Methods that have a retry decorator
     99     need to be 'pure', i.e return the same results when called multiple
    100     times with the same argument.
    101 
    102     There are 2 known exceptions this class can deal with, a socket.error
    103     which happens when we try to execute an rpc when the DUT is, say, suspended
    104     and a BadStatusLine, which we get when we try to execute an rpc while the
    105     DUT is going through a factory_restart. Ideally we would like to handle
    106     socket timeouts different from BadStatusLines as we can get connection
    107     errors even when a device reboots and BadStatusLines ususally only when
    108     factory restarts. crbug.com/281714.
    109     """
    110 
    111     # This timeout was arbitrarily chosen as many tests in the factory test
    112     # suite run for days. Ideally we would like to split this into at least 2
    113     # timeouts, one which we use for rpcs that run while no other test is,
    114     # running and is smaller than the second that is designed for use with rpcs
    115     # that might execute simultaneously with a test. The latter needs a longer
    116     # timeout since tests could suspend,resume for a long time, and a call like
    117     # GetGoofyStatus should be tolerant to these suspend/resumes. In designing
    118     # the base timeout one needs to allocate time to component methods of this
    119     # class (such as _set_test_list) as a multiple of the number of rpcs it
    120     # executes.
    121     BASE_RPC_TIMEOUT = 1440
    122     POLLING_INTERVAL = 5
    123     FACTORY_BUG_RE = r'.*(/tmp/factory_bug.*tar.bz2).*'
    124     UNTAR_COMMAND = 'tar jxf %s -C %s'
    125 
    126 
    127     def __init__(self, host):
    128         """
    129         @param host: The host object representing the DUT running goofy.
    130         """
    131         self._host = host
    132         self._raw_stop_running_tests()
    133         self._create_client_proxy(timeout_min=self.BASE_RPC_TIMEOUT)
    134 
    135 
    136     def _create_client_proxy(self, timeout_min=30):
    137         """Create a goofy client proxy.
    138 
    139         Ping the host till it's up, then proceed to create a goofy proxy. We
    140         don't wrap this method with a retry because it's used in the retry
    141         decorator itself.
    142         """
    143 
    144         # We don't ssh ping here as there is a potential dealy in O(minutes)
    145         # with our ssh command against a sleeping DUT, once it wakes up, and
    146         # that will lead to significant overhead incurred over many reboots.
    147         self._host.ping_wait_up(timeout_min)
    148         logging.info('Host is pingable, creating goofy client proxy')
    149         self._client = self._host.rpc_server_tracker.jsonrpc_connect(
    150                 GOOFY_JSONRPC_SERVER_PORT)
    151 
    152 
    153     @retry.retry((httplib.BadStatusLine, socket.error),
    154                  timeout_min=BASE_RPC_TIMEOUT)
    155     def _raw_stop_running_tests(self):
    156         """Stop running tests by shelling out to the DUT.
    157 
    158         Use this method only before we have actually created the client
    159         proxy, as shelling out has several pitfalls. We need to stop all
    160         tests in a retry loop because tests will start executing as soon
    161         as we have reimaged a DUT and trying to create the proxy while
    162         the DUT is rebooting will lead to a spurious failure.
    163 
    164         Note that we use the plain retry decorator for this method since
    165         we don't need to recreate the client proxy on failure.
    166         """
    167         logging.info('Stopping all tests and clearing factory state')
    168         self._host.run('factory clear')
    169 
    170 
    171     @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
    172                      timeout_min=BASE_RPC_TIMEOUT)
    173     def _get_goofy_status(self):
    174         """Return status of goofy, ignoring socket timeouts and http exceptions.
    175         """
    176         status = self._client.GetGoofyStatus().get('status')
    177         return status
    178 
    179 
    180     def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2):
    181         """Wait till goofy is running or a timeout occurs.
    182 
    183         @param timeout_min: Minutes to wait before timing this call out.
    184         """
    185         current_time = time.time()
    186         timeout_secs = timeout_min * 60
    187         logging.info('Waiting on goofy')
    188         while self._get_goofy_status() != GOOFY_RUNNING:
    189             if time.time() - current_time > timeout_secs:
    190                 break
    191         return
    192 
    193 
    194     @retry_goofy_rpc(socket.error, timeout_min=BASE_RPC_TIMEOUT*2)
    195     def _set_test_list(self, next_list):
    196         """Set the given test list for execution and turn on test automation.
    197 
    198         Confirm that the given test list is a test that has been baked into
    199         the image, then run it. Some test lists are configured to start
    200         execution automatically when we call SetTestList, while others wait
    201         for a corresponding RunTest.
    202 
    203         @param next_list: The name of the test list.
    204 
    205         @raise jsonrpclib.ProtocolError: If the test list we're trying to switch
    206                                          to isn't on the DUT.
    207         """
    208 
    209         # We can get a BadStatus line on 2 occassions:
    210         # 1. As part of SwitchTestList goofy performs a factory restart, which
    211         # will throw a BadStatusLine because the rpc can't exit cleanly. We
    212         # don't want to retry on this exception, since we've already set the
    213         # right test list.
    214         # 2. If we try to set a test list while goofy is already down
    215         # (from a previous factory restart). In this case we wouldn't have
    216         # set the new test list, because we coulnd't connect to goofy.
    217         # To properly set a new test list it's important to wait till goofy is
    218         # up before attempting to set the test list, while being aware that the
    219         # preceding httplib error is from the rpc we just executed leading to
    220         # a factory restart. Also note that if the test list is not already on
    221         # the DUT this method will fail, emitting the possible test lists one
    222         # can switch to.
    223         self._wait_for_goofy()
    224         logging.info('Switching to test list %s', next_list)
    225         try:
    226             # Enable full factory test automation. Full test automation mode
    227             # skips all manual tests and test barriers, which is what we want in
    228             # the test lab. There are other automation modes: partial and none.
    229             # In partial automation mode manual tests and barrier are enabled
    230             # and user intervention is required; none disables automation.
    231             self._client.SwitchTestList(next_list, 'full')
    232         except httplib.BadStatusLine:
    233             logging.info('Switched to list %s, goofy restarting', next_list)
    234 
    235 
    236     @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
    237                      timeout_min=BASE_RPC_TIMEOUT*2)
    238     def _stop_running_tests(self):
    239         """Stop all running tests.
    240 
    241         Wrap the StopTest rpc so we can attempt to stop tests even while a DUT
    242         is suspended or rebooting.
    243         """
    244         logging.info('Stopping tests.')
    245         self._client.StopTest()
    246 
    247 
    248     def _get_test_map(self):
    249         """Get a mapping of test suites -> tests.
    250 
    251         Ignore entries for tests that don't have a path.
    252 
    253         @return: A dictionary of the form
    254                  {'suite_name': ['suite_name.path_to_test', ...]}.
    255         """
    256         test_all = set([test['path'] for test in self._client.GetTests()
    257                         if test.get('path')])
    258 
    259         test_map = collections.defaultdict(list)
    260         for names in test_all:
    261             test_map[names.split('.')[0]].append(names)
    262         return test_map
    263 
    264 
    265     def _log_test_results(self, test_status, current_suite):
    266         """Format test status results and write them to status.log.
    267 
    268         @param test_status: The status dictionary of a single test.
    269         @param current_suite: The current suite name.
    270         """
    271         try:
    272             self._host.job.record('INFO', None, None,
    273                                   'suite %s, test %s, status: %s' %
    274                                   (current_suite, test_status.get('path'),
    275                                    test_status.get('status')))
    276         except AttributeError as e:
    277             logging.error('Could not gather results for current test: %s', e)
    278 
    279 
    280     @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
    281                      timeout_min=BASE_RPC_TIMEOUT*2)
    282     def _get_test_info(self, test_name):
    283         """Get the status of one test.
    284 
    285         @param test_name: The name of the test we need the status of.
    286 
    287         @return: The entry for the test in the status dictionary.
    288         """
    289         for test in self._client.GetTests():
    290             if test['path'] == test_name:
    291                 return test
    292         raise ValueError('Could not find test_name %s in _get_test_info.' %
    293                           test_name)
    294 
    295 
    296     @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
    297                      timeout_min=BASE_RPC_TIMEOUT*2)
    298     def _get_test_run_info(self, run_id):
    299         """Get the information about the given test run.
    300 
    301         @param run_id: The ID of the test run.
    302 
    303         @return: A dict of test run status.
    304         """
    305         return self._client.GetTestRunStatus(run_id)
    306 
    307 
    308     def _wait_on_run(self, run_id):
    309         """Wait until the given test run to end.
    310 
    311         @param run_id: The ID of the test run.
    312 
    313         @raises GoofyRuntimeException: If the test run does not finish
    314             gracefully.
    315         """
    316         finished_tests = set()
    317         run_info = self._get_test_run_info(run_id)
    318         while run_info['status'] == 'RUNNING':
    319             finished = [(t['path'], t['status']) for t in
    320                         run_info['scheduled_tests']
    321                         if t['status'] in ('PASSED', 'FAILED')]
    322             for t in finished:
    323                 if t not in finished_tests:
    324                     logging.info('[%s] %s', t[1], t[0])
    325                     finished_tests.add(t)
    326             time.sleep(self.POLLING_INTERVAL)
    327             run_info = self._get_test_run_info(run_id)
    328         if run_info['status'] != 'FINISHED':
    329             raise GoofyRuntimeException(
    330                     'The requested test run was interrupted.')
    331 
    332 
    333     def _synchronous_run_suite(self, suite_name):
    334         """Run one suite and wait for it to finish.
    335 
    336         Will start a test run for the specified suite_name and wait until it
    337         ends.
    338 
    339         @param suite_name: The name of the suite to wait for.
    340 
    341         @raises GoofyProxyException: If the status of the suite
    342             doesn't switch to active after we call RunTest.
    343 
    344         @return: The result of the suite.
    345         """
    346         logging.info('Starting suite: %s', suite_name)
    347         run_id = self._client.RunTest(suite_name)
    348         self._wait_on_run(run_id)
    349         return self._get_test_run_info(run_id)
    350 
    351 
    352     def monitor_tests(self, test_list):
    353         """Run a test list.
    354 
    355         Will run each suite in the given list in sequence, starting each one
    356         by name and waiting on its results. This method makes the following
    357         assumptions:
    358             - A test list is made up of self contained suites.
    359             - These suites trigger several things in parallel.
    360             - After a suite finishes it leaves goofy in an idle state.
    361 
    362         It is not safe to pull results for individual tests during the suite
    363         as the device could be rebooting, or goofy could be under stress.
    364         Instead, this method synchronously waits on an entire suite, then
    365         asks goofy for the status of each test in the suite. Since certain
    366         test lists automatically start and others don't, this method stops
    367         test list execution regardless, and sequentially triggers each suite.
    368 
    369         @param test_list: The test list to run.
    370         """
    371         self._set_test_list(test_list)
    372         self._wait_for_goofy()
    373         self._stop_running_tests()
    374 
    375         test_map = self._get_test_map()
    376         if test_map:
    377             logging.info('About to execute tests: %s', test_map)
    378         else:
    379             raise GoofyRuntimeException('Test map is empty, you might have an '
    380                                         'error in your test_list.')
    381 
    382 
    383         for current_suite in test_map.keys():
    384             logging.info('Processing suite %s', current_suite)
    385 
    386             result = self._synchronous_run_suite(current_suite)
    387             logging.info(result)
    388 
    389             for test_names in test_map.get(current_suite):
    390                 self._log_test_results(self._get_test_info(test_names),
    391                                        current_suite)
    392 
    393 
    394     @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
    395                      timeout_min=BASE_RPC_TIMEOUT*2)
    396     def get_results(self, resultsdir):
    397         """Copies results from the DUT to a local results directory.
    398 
    399         Copy the tarball over to the results folder, untar, and delete the
    400         tarball if everything was successful. This will effectively place
    401         all the logs relevant to factory testing in the job's results folder.
    402 
    403         @param resultsdir: The directory in which to untar the contents of the
    404                            tarball factory_bug generates.
    405         """
    406         logging.info('Getting results logs for test_list.')
    407 
    408         try:
    409             factory_bug_log = self._host.run('factory_bug').stderr
    410         except error.CmdError as e:
    411             logging.error('Could not execute factory_bug: %s', e)
    412             return
    413 
    414         try:
    415             factory_bug_tar = re.match(self.FACTORY_BUG_RE,
    416                                        factory_bug_log).groups(1)[0]
    417         except (IndexError, AttributeError):
    418             logging.error('could not collect logs for factory results, '
    419                           'factory bug returned %s', factory_bug_log)
    420             return
    421 
    422         factory_bug_tar_file = os.path.basename(factory_bug_tar)
    423         local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file)
    424 
    425         try:
    426             self._host.get_file(factory_bug_tar, local_factory_bug_tar)
    427         except error.AutoservRunError as e:
    428             logging.error('Failed to pull back the results tarball: %s', e)
    429             return
    430 
    431         try:
    432             utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir))
    433         except error.CmdError as e:
    434             logging.error('Failed to untar the results tarball: %s', e)
    435             return
    436         finally:
    437             if os.path.exists(local_factory_bug_tar):
    438                 os.remove(local_factory_bug_tar)
    439