#!/usr/bin/env python
#
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Helper script to shard build bot steps and save results to disk.

Our buildbot infrastructure requires each slave to run steps serially.
This is sub-optimal for android, where these steps can run independently on
multiple connected devices.

The buildbots will run this script multiple times per cycle:
- First: all steps listed in -s will be executed in parallel using all
connected devices. Step results will be pickled to disk. Each step has a unique
name. The result code will be ignored if the step name is listed in
--flaky_steps.
The buildbot will treat this step as a regular step, and will not process any
graph data.

- Then, with -p STEP_NAME: at this stage, we'll simply print the file with the
step results previously saved. The buildbot will then process the graph data
accordingly.

- Alternatively, with -P STEPS_FILE: print the previously saved results of
every step named in the given JSON steps file.

The JSON steps file contains a dictionary in the format:
{
  "step_name_foo": "script_to_execute foo",
  "step_name_bar": "script_to_execute bar"
}

The JSON flaky steps file contains a list of step names whose results should
be ignored:
[
  "step_name_foo",
  "step_name_bar"
]

Note that script_to_execute must accept at least the following options:
  --device: the serial number to be passed to all adb commands.
  --keep_test_server_ports: indicates it's being run as a shard, and shouldn't
  reset test server port allocation.
"""


import datetime
import json
import logging
import multiprocessing
import optparse
import pexpect
import pickle
import os
import signal
import shutil
import sys
import time

from pylib import android_commands
from pylib import cmd_helper
from pylib import constants
from pylib import forwarder
from pylib import ports


# Directory holding one pickled result file per step, named after the step.
_OUTPUT_DIR = os.path.join(constants.DIR_SOURCE_ROOT, 'out', 'step_results')


def _SaveResult(result):
  """Pickles a step result to _OUTPUT_DIR, in a file named after the step.

  Args:
    result: dict describing a finished step; result['name'] is used as the
        file name.
  """
  with open(os.path.join(_OUTPUT_DIR, result['name']), 'wb') as f:
    pickle.dump(result, f)


def _RunStepsPerDevice(steps):
  """Runs the given steps one after another and saves each result to disk.

  Args:
    steps: list of dicts with the keys 'name', 'cmd', 'device' and 'is_flaky',
        as built by _RunShardedSteps.

  Returns:
    A list of result dicts with the keys 'name', 'output', 'exit_code',
    'total_time' (in seconds) and 'device', one per step, in execution order.
  """
  results = []
  for step in steps:
    start_time = datetime.datetime.now()
    print('Starting %s: %s %s at %s' % (step['name'], step['cmd'],
                                        start_time, step['device']))
    output, exit_code = pexpect.run(
        step['cmd'], cwd=os.path.abspath(constants.DIR_SOURCE_ROOT),
        withexitstatus=True, logfile=sys.stdout, timeout=1800,
        env=os.environ)
    if exit_code is None:
      # pexpect reports no exit status when the child did not exit normally
      # (e.g. it was killed after hitting the timeout). That must count as a
      # failure rather than being silently mapped to success.
      exit_code = -1
    end_time = datetime.datetime.now()
    exit_msg = '%s %s' % (exit_code,
                          '(ignored, flaky step)' if step['is_flaky'] else '')
    print('Finished %s: %s %s %s at %s' % (step['name'], exit_msg,
                                           step['cmd'], end_time,
                                           step['device']))
    if step['is_flaky']:
      # The real exit code was already logged above; flaky steps always
      # report success.
      exit_code = 0
    result = {'name': step['name'],
              'output': output,
              'exit_code': exit_code,
              'total_time': (end_time - start_time).seconds,
              'device': step['device']}
    _SaveResult(result)
    results.append(result)
  return results


def _RunShardedSteps(steps, flaky_steps, devices):
  """Distributes the steps over the devices and runs the shards in parallel.

  Any previous contents of _OUTPUT_DIR are removed first. Steps are assigned
  to devices in contiguous chunks of the sorted step names, so that for a
  given set of steps and devices the assignment is always the same.

  Args:
    steps: dict mapping each step name to the command line to execute.
    flaky_steps: collection of step names whose exit code must be ignored.
    devices: list of device serial numbers.

  Returns:
    Always 0: the exit code of each step is reported later, when its saved
    result is printed.
  """
  assert steps
  assert devices, 'No devices connected?'
  if os.path.exists(_OUTPUT_DIR):
    # Sanity check before recursively deleting a directory.
    assert '/step_results' in _OUTPUT_DIR
    shutil.rmtree(_OUTPUT_DIR)
  if not os.path.exists(_OUTPUT_DIR):
    os.makedirs(_OUTPUT_DIR)

  step_names = sorted(steps.keys())
  flaky_names = frozenset(flaky_steps)
  num_devices = len(devices)
  # Ceiling division: every step must land in some shard.
  shard_size = (len(step_names) + num_devices - 1) // num_devices
  all_params = []
  for i, device in enumerate(devices):
    shard = step_names[i * shard_size:(i + 1) * shard_size]
    all_params.append([
        {'name': name,
         'device': device,
         'is_flaky': name in flaky_names,
         'cmd': steps[name] + ' --device ' + device +
                ' --keep_test_server_ports'}
        for name in shard])

  print('Start sharding (note: output is not synchronized...)')
  print('*' * 80)
  start_time = datetime.datetime.now()
  pool = multiprocessing.Pool(processes=num_devices)
  async_results = pool.map_async(_RunStepsPerDevice, all_params)
  # NOTE(review): the very large explicit timeout presumably exists to keep
  # the wait interruptible (e.g. by Ctrl-C) on Python 2 -- confirm before
  # removing it.
  results_per_device = async_results.get(999999)
  end_time = datetime.datetime.now()
  print('*' * 80)
  print('Finished sharding.')
  print('Summary')
  total_time = 0
  for results in results_per_device:
    for result in results:
      print('%s : exit_code=%d in %d secs at %s' %
            (result['name'], result['exit_code'], result['total_time'],
             result['device']))
      total_time += result['total_time']
  print('Step time: %d secs' % ((end_time - start_time).seconds))
  print('Bots time: %d secs' % total_time)
  # No exit_code for the sharding step: the individual _PrintResults step
  # will return the corresponding exit_code.
  return 0


def _PrintStepOutput(step_name):
  """Prints the saved output of a step.

  Args:
    step_name: name of a step whose result was saved by _SaveResult.

  Returns:
    The saved exit code of the step, or 1 if no result file exists for it.
  """
  file_name = os.path.join(_OUTPUT_DIR, step_name)
  if not os.path.exists(file_name):
    print('File not found  %s' % file_name)
    return 1
  # NOTE: unpickling can execute arbitrary code. This is only acceptable
  # because the files in _OUTPUT_DIR are expected to be written by
  # _SaveResult; never point this at untrusted data.
  with open(file_name, 'rb') as f:
    result = pickle.load(f)
  print(result['output'])
  return result['exit_code']


def _PrintAllStepsOutput(steps):
  """Prints the saved output of every step listed in a JSON steps file.

  Args:
    steps: path to a JSON file whose top-level keys are the step names.

  Returns:
    The bitwise OR of the exit codes of all the steps (0 if all succeeded).
  """
  with open(steps, 'r') as f:
    step_dict = json.load(f)
  ret = 0
  for step_name in sorted(step_dict):
    ret |= _PrintStepOutput(step_name)
  return ret


def _KillPendingServers():
  """Kills leftover helper servers and restarts the adb server.

  Sends SIGQUIT to every process matched by 'pgrep -f' for lighttpd and
  web-page-replay (five passes, failures are only logged), then restarts adb
  pinned to CPU 0 and waits with a growing back-off for a device to show up.
  """
  for _ in range(5):
    for server in ['lighttpd', 'web-page-replay']:
      pgrep_output = cmd_helper.GetCmdOutput(['pgrep', '-f', server])
      pids = [pid.strip() for pid in pgrep_output.split('\n') if pid.strip()]
      for pid in pids:
        try:
          logging.warning('Killing %s %s', server, pid)
          os.kill(int(pid), signal.SIGQUIT)
        except Exception as e:
          # Best effort: the process may already be gone.
          logging.warning('Failed killing %s %s %s', server, pid, e)
  # Restart the adb server with taskset to set a single CPU affinity.
  cmd_helper.RunCmd(['adb', 'kill-server'])
  cmd_helper.RunCmd(['taskset', '-c', '0', 'adb', 'start-server'])
  cmd_helper.RunCmd(['taskset', '-c', '0', 'adb', 'root'])
  # Wait for a device to come back: sleeps of 1, 2, 4 and 8 seconds at most.
  delay = 1
  while not android_commands.GetAttachedDevices():
    time.sleep(delay)
    delay *= 2
    if delay > 10:
      break


def main(argv):
  """Entry point: either prints saved step results or runs the sharded steps.

  Args:
    argv: full command line, including the program name.

  Returns:
    The process exit code.
  """
  parser = optparse.OptionParser()
  parser.add_option('-s', '--steps',
                    help='A JSON file containing all the steps to be '
                         'sharded.')
  parser.add_option('--flaky_steps',
                    help='A JSON file containing steps that are flaky and '
                         'will have its exit code ignored.')
  parser.add_option('-p', '--print_results',
                    help='Only prints the results for the previously '
                         'executed step, do not run it again.')
  parser.add_option('-P', '--print_all',
                    help='Only prints the results for the previously '
                         'executed steps, do not run them again.')
  options, _ = parser.parse_args(argv)
  if options.print_results:
    return _PrintStepOutput(options.print_results)
  if options.print_all:
    return _PrintAllStepsOutput(options.print_all)

  # Fail fast, before killing servers and restarting adb, when there is
  # nothing to run.
  if not options.steps:
    parser.error('--steps is required unless -p or -P is given.')

  # At this point, we should kill everything that may have been left over from
  # previous runs.
  _KillPendingServers()

  forwarder.Forwarder.UseMultiprocessing()

  # Reset the test port allocation. It's important to do it before starting
  # to dispatch any step.
  if not ports.ResetTestServerPortAllocation():
    raise Exception('Failed to reset test server port.')

  # Sort the devices so that we'll try to always run a step in the same device.
  devices = sorted(android_commands.GetAttachedDevices())
  if not devices:
    print('You must attach a device')
    return 1

  with open(options.steps, 'r') as f:
    steps = json.load(f)
  flaky_steps = []
  if options.flaky_steps:
    with open(options.flaky_steps, 'r') as f:
      flaky_steps = json.load(f)
  return _RunShardedSteps(steps, flaky_steps, devices)


if __name__ == '__main__':
  sys.exit(main(sys.argv))