Home | History | Annotate | Download | only in android
      1 #!/usr/bin/env python
      2 #
      3 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
      4 # Use of this source code is governed by a BSD-style license that can be
      5 # found in the LICENSE file.
      6 
      7 """Helper script to shard build bot steps and save results to disk.
      8 
      9 Our buildbot infrastructure requires each slave to run steps serially.
     10 This is sub-optimal for android, where these steps can run independently on
     11 multiple connected devices.
     12 
     13 The buildbots will run this script multiple times per cycle:
- First: all steps listed in the -s file will be executed in parallel using all
     15 connected devices. Step results will be pickled to disk. Each step has a unique
     16 name. The result code will be ignored if the step name is listed in
     17 --flaky_steps.
     18 The buildbot will treat this step as a regular step, and will not process any
     19 graph data.
     20 
     21 - Then, with -p STEP_NAME: at this stage, we'll simply print the file with the
     22 step results previously saved. The buildbot will then process the graph data
     23 accordingly.
     24 
     25 The JSON steps file contains a dictionary in the format:
     26 {
     27   "step_name_foo": "script_to_execute foo",
     28   "step_name_bar": "script_to_execute bar"
     29 }
     30 
The JSON flaky steps file contains a list of step names whose results should
be ignored:
     33 [
     34   "step_name_foo",
     35   "step_name_bar"
     36 ]
     37 
Note that script_to_execute must accept at least the following
options:
     40   --device: the serial number to be passed to all adb commands.
     41   --keep_test_server_ports: indicates it's being run as a shard, and shouldn't
     42   reset test server port allocation.
     43 """
     44 
     45 
     46 import datetime
     47 import json
     48 import logging
     49 import multiprocessing
     50 import optparse
     51 import pexpect
     52 import pickle
     53 import os
     54 import signal
     55 import shutil
     56 import sys
     57 import time
     58 
     59 from pylib import android_commands
     60 from pylib import cmd_helper
     61 from pylib import constants
     62 from pylib import forwarder
     63 from pylib import ports
     64 
     65 
# Directory holding one pickled result file per step. It is wiped and
# re-created by _RunShardedSteps and read back by _PrintStepOutput.
_OUTPUT_DIR = os.path.join(constants.DIR_SOURCE_ROOT, 'out', 'step_results')
     67 
     68 
     69 def _SaveResult(result):
     70   with file(os.path.join(_OUTPUT_DIR, result['name']), 'w') as f:
     71     f.write(pickle.dumps(result))
     72 
     73 
     74 def _RunStepsPerDevice(steps):
     75   results = []
     76   for step in steps:
     77     start_time = datetime.datetime.now()
     78     print 'Starting %s: %s %s at %s' % (step['name'], step['cmd'],
     79                                         start_time, step['device'])
     80     output, exit_code  = pexpect.run(
     81         step['cmd'], cwd=os.path.abspath(constants.DIR_SOURCE_ROOT),
     82         withexitstatus=True, logfile=sys.stdout, timeout=1800,
     83         env=os.environ)
     84     exit_code = exit_code or 0
     85     end_time = datetime.datetime.now()
     86     exit_msg = '%s %s' % (exit_code,
     87                           '(ignored, flaky step)' if step['is_flaky'] else '')
     88     print 'Finished %s: %s %s %s at %s' % (step['name'], exit_msg, step['cmd'],
     89                                            end_time, step['device'])
     90     if step['is_flaky']:
     91       exit_code = 0
     92     result = {'name': step['name'],
     93               'output': output,
     94               'exit_code': exit_code,
     95               'total_time': (end_time - start_time).seconds,
     96               'device': step['device']}
     97     _SaveResult(result)
     98     results += [result]
     99   return results
    100 
    101 
    102 def _RunShardedSteps(steps, flaky_steps, devices):
    103   assert steps
    104   assert devices, 'No devices connected?'
    105   if os.path.exists(_OUTPUT_DIR):
    106     assert '/step_results' in _OUTPUT_DIR
    107     shutil.rmtree(_OUTPUT_DIR)
    108   if not os.path.exists(_OUTPUT_DIR):
    109     os.makedirs(_OUTPUT_DIR)
    110   step_names = sorted(steps.keys())
    111   all_params = []
    112   num_devices = len(devices)
    113   shard_size = (len(steps) + num_devices - 1) / num_devices
    114   for i, device in enumerate(devices):
    115     steps_per_device = []
    116     for s in steps.keys()[i * shard_size:(i + 1) * shard_size]:
    117       steps_per_device += [{'name': s,
    118                             'device': device,
    119                             'is_flaky': s in flaky_steps,
    120                             'cmd': steps[s] + ' --device ' + device +
    121                             ' --keep_test_server_ports'}]
    122     all_params += [steps_per_device]
    123   print 'Start sharding (note: output is not synchronized...)'
    124   print '*' * 80
    125   start_time = datetime.datetime.now()
    126   pool = multiprocessing.Pool(processes=num_devices)
    127   async_results = pool.map_async(_RunStepsPerDevice, all_params)
    128   results_per_device = async_results.get(999999)
    129   end_time = datetime.datetime.now()
    130   print '*' * 80
    131   print 'Finished sharding.'
    132   print 'Summary'
    133   total_time = 0
    134   for results in results_per_device:
    135     for result in results:
    136       print('%s : exit_code=%d in %d secs at %s' %
    137             (result['name'], result['exit_code'], result['total_time'],
    138              result['device']))
    139       total_time += result['total_time']
    140   print 'Step time: %d secs' % ((end_time - start_time).seconds)
    141   print 'Bots time: %d secs' % total_time
    142   # No exit_code for the sharding step: the individual _PrintResults step
    143   # will return the corresponding exit_code.
    144   return 0
    145 
    146 
    147 def _PrintStepOutput(step_name):
    148   file_name = os.path.join(_OUTPUT_DIR, step_name)
    149   if not os.path.exists(file_name):
    150     print 'File not found ', file_name
    151     return 1
    152   with file(file_name, 'r') as f:
    153     result = pickle.loads(f.read())
    154   print result['output']
    155   return result['exit_code']
    156 
    157 
    158 def _PrintAllStepsOutput(steps):
    159   with file(steps, 'r') as f:
    160     steps = json.load(f)
    161   ret = 0
    162   for step_name in steps.keys():
    163     ret |= _PrintStepOutput(step_name)
    164   return ret
    165 
    166 
    167 def _KillPendingServers():
    168   for retry in range(5):
    169     for server in ['lighttpd', 'web-page-replay']:
    170       pids = cmd_helper.GetCmdOutput(['pgrep', '-f', server])
    171       pids = [pid.strip() for pid in pids.split('\n') if pid.strip()]
    172       for pid in pids:
    173         try:
    174           logging.warning('Killing %s %s', server, pid)
    175           os.kill(int(pid), signal.SIGQUIT)
    176         except Exception as e:
    177           logging.warning('Failed killing %s %s %s', server, pid, e)
    178   # Restart the adb server with taskset to set a single CPU affinity.
    179   cmd_helper.RunCmd(['adb', 'kill-server'])
    180   cmd_helper.RunCmd(['taskset', '-c', '0', 'adb', 'start-server'])
    181   cmd_helper.RunCmd(['taskset', '-c', '0', 'adb', 'root'])
    182   i = 1
    183   while not android_commands.GetAttachedDevices():
    184     time.sleep(i)
    185     i *= 2
    186     if i > 10:
    187       break
    188 
    189 
    190 def main(argv):
    191   parser = optparse.OptionParser()
    192   parser.add_option('-s', '--steps',
    193                     help='A JSON file containing all the steps to be '
    194                          'sharded.')
    195   parser.add_option('--flaky_steps',
    196                     help='A JSON file containing steps that are flaky and '
    197                          'will have its exit code ignored.')
    198   parser.add_option('-p', '--print_results',
    199                     help='Only prints the results for the previously '
    200                          'executed step, do not run it again.')
    201   parser.add_option('-P', '--print_all',
    202                     help='Only prints the results for the previously '
    203                          'executed steps, do not run them again.')
    204   options, urls = parser.parse_args(argv)
    205   if options.print_results:
    206     return _PrintStepOutput(options.print_results)
    207   if options.print_all:
    208     return _PrintAllStepsOutput(options.print_all)
    209 
    210   # At this point, we should kill everything that may have been left over from
    211   # previous runs.
    212   _KillPendingServers()
    213 
    214   forwarder.Forwarder.UseMultiprocessing()
    215 
    216   # Reset the test port allocation. It's important to do it before starting
    217   # to dispatch any step.
    218   if not ports.ResetTestServerPortAllocation():
    219     raise Exception('Failed to reset test server port.')
    220 
    221   # Sort the devices so that we'll try to always run a step in the same device.
    222   devices = sorted(android_commands.GetAttachedDevices())
    223   if not devices:
    224     print 'You must attach a device'
    225     return 1
    226 
    227   with file(options.steps, 'r') as f:
    228     steps = json.load(f)
    229   flaky_steps = []
    230   if options.flaky_steps:
    231     with file(options.flaky_steps, 'r') as f:
    232       flaky_steps = json.load(f)
    233   return _RunShardedSteps(steps, flaky_steps, devices)
    234 
    235 
    236 if __name__ == '__main__':
    237   sys.exit(main(sys.argv))
    238