1 # Copyright (c) 2013 The Chromium OS Authors. All rights reserved. 2 # Use of this source code is governed by a BSD-style license that can be 3 # found in the LICENSE file. 4 5 import collections 6 import httplib 7 import logging 8 import os 9 import re 10 import socket 11 import time 12 13 import common 14 15 from autotest_lib.client.common_lib import error 16 from autotest_lib.client.common_lib.cros import retry 17 from autotest_lib.server import utils 18 19 20 GOOFY_JSONRPC_SERVER_PORT = 0x0FAC 21 GOOFY_RUNNING = 'RUNNING' 22 23 24 class GoofyProxyException(Exception): 25 """Exception raised when a goofy rpc fails.""" 26 pass 27 28 29 class GoofyRuntimeException(Exception): 30 """Exception raised when something goes wrong while a test is running.""" 31 pass 32 33 34 def retry_goofy_rpc(exception_tuple, timeout_min=30): 35 """A decorator to use with goofy rpcs. 36 37 This decorator tries to recreate the goofy client proxy on 38 socket error. It will continue trying to do so until it 39 executes the method without any socket errors or till the 40 retry.retry decorator hits it's timeout. 41 42 Usage: 43 If you just want to recreate the proxy: 44 1. @retry_goofy_rpc(exception_tuple=(<exception>, socket.error), 45 timeout_min=<timeout>) 46 2. @retry_goofy_rpc(socket.error, timeout_min=<timeout>) 47 Note: you need to specify the socket.error exception because we 48 want to retry the call after recreating the proxy. 49 50 @param exception_tuple: A tuple of exceptions to pass to 51 the retry decorator. Any of these exceptions will result 52 in retries for the duration of timeout_min. 53 @param timeout_min: The timeout, in minutes, for which we should 54 retry the method ignoring any exception in exception_tuple. 55 """ 56 def inner_decorator(method): 57 """Inner retry decorator applied to the method. 58 59 @param method: The method that needs to be wrapped in the decorator. 60 61 @return A wrapper function that implements the retry. 62 """ 63 64 @retry.retry(exception_tuple, timeout_min=timeout_min) 65 def wrapper(*args, **kwargs): 66 """This wrapper handles socket errors. 67 68 If the method in question: 69 1. Throws an exception in exception_tuple and it is not a 70 socket.error, retry for timeout_min through retry.retry. 71 2. Throws a socket.error, recreate the client proxy, and 72 retry for timeout_min through retry.retry. 73 3. Throws an exception not in exception_tuple, fail. 74 """ 75 try: 76 return method(*args, **kwargs) 77 except socket.error as e: 78 goofy_proxy = args[0] 79 if type(goofy_proxy) is GoofyProxy: 80 logging.warning('Socket error while running factory tests ' 81 '%s, recreating goofy proxy.', e) 82 goofy_proxy._create_client_proxy(timeout_min=timeout_min) 83 else: 84 logging.warning('Connectivity was lost and the retry ' 85 'decorator was unable to recreate a goofy ' 86 'client proxy, args: %s.', args) 87 raise 88 89 return wrapper 90 91 return inner_decorator 92 93 94 class GoofyProxy(object): 95 """Client capable of making rpc calls to goofy. 96 97 Methods of this class that can cause goofy to change state 98 usually need a retry decorator. Methods that have a retry decorator 99 need to be 'pure', i.e return the same results when called multiple 100 times with the same argument. 101 102 There are 2 known exceptions this class can deal with, a socket.error 103 which happens when we try to execute an rpc when the DUT is, say, suspended 104 and a BadStatusLine, which we get when we try to execute an rpc while the 105 DUT is going through a factory_restart. Ideally we would like to handle 106 socket timeouts different from BadStatusLines as we can get connection 107 errors even when a device reboots and BadStatusLines ususally only when 108 factory restarts. crbug.com/281714. 109 """ 110 111 # This timeout was arbitrarily chosen as many tests in the factory test 112 # suite run for days. Ideally we would like to split this into at least 2 113 # timeouts, one which we use for rpcs that run while no other test is, 114 # running and is smaller than the second that is designed for use with rpcs 115 # that might execute simultaneously with a test. The latter needs a longer 116 # timeout since tests could suspend,resume for a long time, and a call like 117 # GetGoofyStatus should be tolerant to these suspend/resumes. In designing 118 # the base timeout one needs to allocate time to component methods of this 119 # class (such as _set_test_list) as a multiple of the number of rpcs it 120 # executes. 121 BASE_RPC_TIMEOUT = 1440 122 POLLING_INTERVAL = 5 123 FACTORY_BUG_RE = r'.*(/tmp/factory_bug.*tar.bz2).*' 124 UNTAR_COMMAND = 'tar jxf %s -C %s' 125 126 127 def __init__(self, host): 128 """ 129 @param host: The host object representing the DUT running goofy. 130 """ 131 self._host = host 132 self._raw_stop_running_tests() 133 self._create_client_proxy(timeout_min=self.BASE_RPC_TIMEOUT) 134 135 136 def _create_client_proxy(self, timeout_min=30): 137 """Create a goofy client proxy. 138 139 Ping the host till it's up, then proceed to create a goofy proxy. We 140 don't wrap this method with a retry because it's used in the retry 141 decorator itself. 142 """ 143 144 # We don't ssh ping here as there is a potential dealy in O(minutes) 145 # with our ssh command against a sleeping DUT, once it wakes up, and 146 # that will lead to significant overhead incurred over many reboots. 147 self._host.ping_wait_up(timeout_min) 148 logging.info('Host is pingable, creating goofy client proxy') 149 self._client = self._host.rpc_server_tracker.jsonrpc_connect( 150 GOOFY_JSONRPC_SERVER_PORT) 151 152 153 @retry.retry((httplib.BadStatusLine, socket.error), 154 timeout_min=BASE_RPC_TIMEOUT) 155 def _raw_stop_running_tests(self): 156 """Stop running tests by shelling out to the DUT. 157 158 Use this method only before we have actually created the client 159 proxy, as shelling out has several pitfalls. We need to stop all 160 tests in a retry loop because tests will start executing as soon 161 as we have reimaged a DUT and trying to create the proxy while 162 the DUT is rebooting will lead to a spurious failure. 163 164 Note that we use the plain retry decorator for this method since 165 we don't need to recreate the client proxy on failure. 166 """ 167 logging.info('Stopping all tests and clearing factory state') 168 self._host.run('factory clear') 169 170 171 @retry_goofy_rpc((httplib.BadStatusLine, socket.error), 172 timeout_min=BASE_RPC_TIMEOUT) 173 def _get_goofy_status(self): 174 """Return status of goofy, ignoring socket timeouts and http exceptions. 175 """ 176 status = self._client.GetGoofyStatus().get('status') 177 return status 178 179 180 def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2): 181 """Wait till goofy is running or a timeout occurs. 182 183 @param timeout_min: Minutes to wait before timing this call out. 184 """ 185 current_time = time.time() 186 timeout_secs = timeout_min * 60 187 logging.info('Waiting on goofy') 188 while self._get_goofy_status() != GOOFY_RUNNING: 189 if time.time() - current_time > timeout_secs: 190 break 191 return 192 193 194 @retry_goofy_rpc(socket.error, timeout_min=BASE_RPC_TIMEOUT*2) 195 def _set_test_list(self, next_list): 196 """Set the given test list for execution and turn on test automation. 197 198 Confirm that the given test list is a test that has been baked into 199 the image, then run it. Some test lists are configured to start 200 execution automatically when we call SetTestList, while others wait 201 for a corresponding RunTest. 202 203 @param next_list: The name of the test list. 204 205 @raise jsonrpclib.ProtocolError: If the test list we're trying to switch 206 to isn't on the DUT. 207 """ 208 209 # We can get a BadStatus line on 2 occassions: 210 # 1. As part of SwitchTestList goofy performs a factory restart, which 211 # will throw a BadStatusLine because the rpc can't exit cleanly. We 212 # don't want to retry on this exception, since we've already set the 213 # right test list. 214 # 2. If we try to set a test list while goofy is already down 215 # (from a previous factory restart). In this case we wouldn't have 216 # set the new test list, because we coulnd't connect to goofy. 217 # To properly set a new test list it's important to wait till goofy is 218 # up before attempting to set the test list, while being aware that the 219 # preceding httplib error is from the rpc we just executed leading to 220 # a factory restart. Also note that if the test list is not already on 221 # the DUT this method will fail, emitting the possible test lists one 222 # can switch to. 223 self._wait_for_goofy() 224 logging.info('Switching to test list %s', next_list) 225 try: 226 # Enable full factory test automation. Full test automation mode 227 # skips all manual tests and test barriers, which is what we want in 228 # the test lab. There are other automation modes: partial and none. 229 # In partial automation mode manual tests and barrier are enabled 230 # and user intervention is required; none disables automation. 231 self._client.SwitchTestList(next_list, 'full') 232 except httplib.BadStatusLine: 233 logging.info('Switched to list %s, goofy restarting', next_list) 234 235 236 @retry_goofy_rpc((httplib.BadStatusLine, socket.error), 237 timeout_min=BASE_RPC_TIMEOUT*2) 238 def _stop_running_tests(self): 239 """Stop all running tests. 240 241 Wrap the StopTest rpc so we can attempt to stop tests even while a DUT 242 is suspended or rebooting. 243 """ 244 logging.info('Stopping tests.') 245 self._client.StopTest() 246 247 248 def _get_test_map(self): 249 """Get a mapping of test suites -> tests. 250 251 Ignore entries for tests that don't have a path. 252 253 @return: A dictionary of the form 254 {'suite_name': ['suite_name.path_to_test', ...]}. 255 """ 256 test_all = set([test['path'] for test in self._client.GetTests() 257 if test.get('path')]) 258 259 test_map = collections.defaultdict(list) 260 for names in test_all: 261 test_map[names.split('.')[0]].append(names) 262 return test_map 263 264 265 def _log_test_results(self, test_status, current_suite): 266 """Format test status results and write them to status.log. 267 268 @param test_status: The status dictionary of a single test. 269 @param current_suite: The current suite name. 270 """ 271 try: 272 self._host.job.record('INFO', None, None, 273 'suite %s, test %s, status: %s' % 274 (current_suite, test_status.get('path'), 275 test_status.get('status'))) 276 except AttributeError as e: 277 logging.error('Could not gather results for current test: %s', e) 278 279 280 @retry_goofy_rpc((httplib.BadStatusLine, socket.error), 281 timeout_min=BASE_RPC_TIMEOUT*2) 282 def _get_test_info(self, test_name): 283 """Get the status of one test. 284 285 @param test_name: The name of the test we need the status of. 286 287 @return: The entry for the test in the status dictionary. 288 """ 289 for test in self._client.GetTests(): 290 if test['path'] == test_name: 291 return test 292 raise ValueError('Could not find test_name %s in _get_test_info.' % 293 test_name) 294 295 296 @retry_goofy_rpc((httplib.BadStatusLine, socket.error), 297 timeout_min=BASE_RPC_TIMEOUT*2) 298 def _get_test_run_info(self, run_id): 299 """Get the information about the given test run. 300 301 @param run_id: The ID of the test run. 302 303 @return: A dict of test run status. 304 """ 305 return self._client.GetTestRunStatus(run_id) 306 307 308 def _wait_on_run(self, run_id): 309 """Wait until the given test run to end. 310 311 @param run_id: The ID of the test run. 312 313 @raises GoofyRuntimeException: If the test run does not finish 314 gracefully. 315 """ 316 finished_tests = set() 317 run_info = self._get_test_run_info(run_id) 318 while run_info['status'] == 'RUNNING': 319 finished = [(t['path'], t['status']) for t in 320 run_info['scheduled_tests'] 321 if t['status'] in ('PASSED', 'FAILED')] 322 for t in finished: 323 if t not in finished_tests: 324 logging.info('[%s] %s', t[1], t[0]) 325 finished_tests.add(t) 326 time.sleep(self.POLLING_INTERVAL) 327 run_info = self._get_test_run_info(run_id) 328 if run_info['status'] != 'FINISHED': 329 raise GoofyRuntimeException( 330 'The requested test run was interrupted.') 331 332 333 def _synchronous_run_suite(self, suite_name): 334 """Run one suite and wait for it to finish. 335 336 Will start a test run for the specified suite_name and wait until it 337 ends. 338 339 @param suite_name: The name of the suite to wait for. 340 341 @raises GoofyProxyException: If the status of the suite 342 doesn't switch to active after we call RunTest. 343 344 @return: The result of the suite. 345 """ 346 logging.info('Starting suite: %s', suite_name) 347 run_id = self._client.RunTest(suite_name) 348 self._wait_on_run(run_id) 349 return self._get_test_run_info(run_id) 350 351 352 def monitor_tests(self, test_list): 353 """Run a test list. 354 355 Will run each suite in the given list in sequence, starting each one 356 by name and waiting on its results. This method makes the following 357 assumptions: 358 - A test list is made up of self contained suites. 359 - These suites trigger several things in parallel. 360 - After a suite finishes it leaves goofy in an idle state. 361 362 It is not safe to pull results for individual tests during the suite 363 as the device could be rebooting, or goofy could be under stress. 364 Instead, this method synchronously waits on an entire suite, then 365 asks goofy for the status of each test in the suite. Since certain 366 test lists automatically start and others don't, this method stops 367 test list execution regardless, and sequentially triggers each suite. 368 369 @param test_list: The test list to run. 370 """ 371 self._set_test_list(test_list) 372 self._wait_for_goofy() 373 self._stop_running_tests() 374 375 test_map = self._get_test_map() 376 if test_map: 377 logging.info('About to execute tests: %s', test_map) 378 else: 379 raise GoofyRuntimeException('Test map is empty, you might have an ' 380 'error in your test_list.') 381 382 383 for current_suite in test_map.keys(): 384 logging.info('Processing suite %s', current_suite) 385 386 result = self._synchronous_run_suite(current_suite) 387 logging.info(result) 388 389 for test_names in test_map.get(current_suite): 390 self._log_test_results(self._get_test_info(test_names), 391 current_suite) 392 393 394 @retry_goofy_rpc((httplib.BadStatusLine, socket.error), 395 timeout_min=BASE_RPC_TIMEOUT*2) 396 def get_results(self, resultsdir): 397 """Copies results from the DUT to a local results directory. 398 399 Copy the tarball over to the results folder, untar, and delete the 400 tarball if everything was successful. This will effectively place 401 all the logs relevant to factory testing in the job's results folder. 402 403 @param resultsdir: The directory in which to untar the contents of the 404 tarball factory_bug generates. 405 """ 406 logging.info('Getting results logs for test_list.') 407 408 try: 409 factory_bug_log = self._host.run('factory_bug').stderr 410 except error.CmdError as e: 411 logging.error('Could not execute factory_bug: %s', e) 412 return 413 414 try: 415 factory_bug_tar = re.match(self.FACTORY_BUG_RE, 416 factory_bug_log).groups(1)[0] 417 except (IndexError, AttributeError): 418 logging.error('could not collect logs for factory results, ' 419 'factory bug returned %s', factory_bug_log) 420 return 421 422 factory_bug_tar_file = os.path.basename(factory_bug_tar) 423 local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file) 424 425 try: 426 self._host.get_file(factory_bug_tar, local_factory_bug_tar) 427 except error.AutoservRunError as e: 428 logging.error('Failed to pull back the results tarball: %s', e) 429 return 430 431 try: 432 utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir)) 433 except error.CmdError as e: 434 logging.error('Failed to untar the results tarball: %s', e) 435 return 436 finally: 437 if os.path.exists(local_factory_bug_tar): 438 os.remove(local_factory_bug_tar) 439