1 #!/usr/bin/python 2 # Copyright (c) 2014 The Chromium OS Authors. All rights reserved. 3 # Use of this source code is governed by a BSD-style license that can be 4 # found in the LICENSE file. 5 6 """Orchestrate virtual machines to setup a toy instance of the lab for testing. 7 8 This module is meant to help create a closed loop development flow for members 9 of the lab team which looks something like this: 10 ______________ 11 | | 12 |gs vm resistry|<+ 13 |______________| | 14 | | 15 v | 16 New change -> puppylab -> New core_cluster box 17 | 18 Vagrantfile specifies cluster settings 19 _________________|____________________ 20 | | 21 | puppet provisions core_cluster box | 22 |______________________________________| 23 | | ........... | 24 v v v 25 master shard1 shardn 26 | | | | | | 27 mysql afe tko heartbt tko heartbt 28 | | | | | | 29 host ports 8001 8002 8001 8002 8001 8002 30 [host ports liable to autocorrect as needed] 31 32 This module can work with any vm hosting service/provider as long as they 33 adhere to the vagrant interface. VirtualBox is the only implementation so 34 far, though GCE will be an ideal candidate. 35 36 Class spec: 37 * VagrantProvisioner: Provision boxes per a VagrantFile. 38 * VirtualBoxProvisioner: Generate a Virtualbox VagrantFile. 39 * CoreVM: Manage individual core_cluster vms. 40 * ClusterManager: Spin up cluster. 41 42 Usage: clusterctl --admin-repo /usr/local/autotest/chromeos-internal 43 """ 44 45 import argparse 46 import logging 47 import os 48 import sys 49 50 import common 51 from autotest_lib.puppylab import lab_manifest 52 from autotest_lib.puppylab import vm_manager 53 from autotest_lib.site_utils.lib import infra 54 55 56 # TODO: Enable multiple shards via command line args. 
NUM_SHARDS = 1
SHADOW_PATH = '/usr/local/autotest/shadow_config.ini'


class ConfigError(Exception):
    """Raised if one of the vms in the cluster is misconfigured."""


class CoreVM(object):
    """Interface to create and manage a core_cluster vm image.

    A core_cluster vm image has base packages shared by all server roles.
    """
    _core_vm_name = 'chromeos_lab_core_cluster'
    _core_image_source = 'gs://vms/%s.box' % _core_vm_name
    _core_image_name = '%s.box' % _core_vm_name
    _core_image_destination = os.path.join(
            vm_manager.VAGRANT_DIR, _core_image_name)

    # TODO: Preparation is currently by hand. Use the provisioner to
    # create a box of name '_core_image_name', with the CoreClusterTemplate
    # in the VAGRANT_DIR if you wish to prepare a new vm. You can achieve
    # this by:
    # * Copying the CoreClusterTemplate to a Vagrantfile and replacing the
    #   modulepath with the path to your chromeos-admin/puppet directory.
    # * Calling `vagrant up` in the directory with this vagrant file.
    # * When it's done, calling vagrant package.
    # This should produce a package.box in the same dir.

    def __init__(self, provisioner):
        """Remember the provisioner used to (un)register the core box.

        @param provisioner: An object exposing register_box and
                unregister_box, as used by the methods below.
        """
        self.provisioner = provisioner


    def setup_core_box(self):
        """Setup a core cluster vm.

        Download a core_cluster image if one isn't present on disk and
        register it with vagrant.
        """
        # Only hit the network when the image is not already on disk.
        if not os.path.exists(self._core_image_destination):
            infra.execute_command(
                    'localhost', 'gsutil cp %s %s' %
                    (self._core_image_source, self._core_image_destination))
        self.provisioner.register_box(
                self._core_image_destination, self._core_vm_name)


    def teardown_core_box(self):
        """Teardown a core cluster vm."""
        # TODO: delete the box file.
        self.provisioner.unregister_box(self._core_vm_name)


class ClusterManager(object):
    """Interface to spin up a cluster of CoreVMs.

    This class manages all the details between creating a core_cluster image
    and running tests on a full fledged cluster.
    """

    def _register_shards(self, num_shards):
        """Register num_shards worth of shard info.

        This includes the name, port address and board of the new shard. This
        information is piped through to each vm, so the cluster manager is
        actually in control of all the shards in the cluster and can address
        them by name.

        Consider a shard, shard1, assigned to board stumpy:
            * You will be able to ssh into it with 'vagrant ssh stumpyshard'.
            * The afe for the shard will be running on an incrementally
              designated port starting from shards_base_port.
            * The afe port of the shard is piped through to the shadow_config.
              This is required for 2 reasons:
                # `cli/atest shard add` should use this name, because it is
                  the name the shard-client will use to request jobs.
                # the master afe should show links to the shard using this
                  name.

        @param num_shards: The number of shards we wish to add to the cluster.
        """
        self.vagrantfile_shard_args = {}
        self.shard_board_map = {}
        self.vagrant_shard_names = []

        for num in range(1, num_shards + 1):
            # The name to use for vagrant ssh
            shard_name = 'shard%s' % num
            # The port for the shard's afe
            shard_port = lab_manifest.shards_base_port + num
            # The hostname to use in the shadow_config of the shard
            shard_hostname = '%s:%s' % (lab_manifest.vm_host_name, shard_port)

            self.vagrantfile_shard_args.update({
                shard_name: shard_name,
                '%s_shadow_config_hostname' % shard_name: shard_hostname,
                '%s_port' % shard_name: shard_port,
            })
            # NOTE(review): pop() consumes the module-level
            # lab_manifest.shards, so a second ClusterManager in the same
            # process sees fewer boards -- confirm this is intended.
            if lab_manifest.shards:
                board = lab_manifest.shards.pop()
                # Assign a board to a shard. Use the shard_hostname as this
                # setting is not meant to be human understandable.
                self.shard_board_map[shard_hostname] = board
                vagrant_shard_name = '%sshard' % board.rsplit(':')[-1]
                # Replace the shardN-style name with the board based name.
                self.vagrantfile_shard_args[shard_name] = vagrant_shard_name
                # NOTE(review): shards that get no board are never added to
                # vagrant_shard_names, so check_services skips them --
                # confirm this is intended.
                self.vagrant_shard_names.append(vagrant_shard_name)


    def __init__(self, vm_provisioner, vagrant_master_name='master',
                 num_shards=1):
        """Initialize parameters for the cluster.

        @param vm_provisioner: A provisioner object, currently the only one
                supported is VirtualBox.
        @param vagrant_master_name: The name to give the cluster master.
        @param num_shards: The number of shards in the cluster. Each shard
                gets a name allocated based on its number (eg: shard1).
        """
        self.provisioner = vm_provisioner
        self.vm_manager = CoreVM(provisioner=self.provisioner)
        self._register_shards(num_shards)
        self.vagrant_master_name = vagrant_master_name


    def start_cluster(self):
        """Start a cluster.

        Makes sure the core box is registered, then generates the vagrant
        configuration for the master and all registered shards and
        provisions the vms.
        """
        self.vm_manager.setup_core_box()

        # TODO: Add a --rebuild-cluster option.
        needs_destroy = self.provisioner.initialize_vagrant(
                master=self.vagrant_master_name,
                master_port=lab_manifest.master_afe_port,
                **self.vagrantfile_shard_args)
        self.provisioner.provision(needs_destroy)


    def shutdown_cluster(self):
        """Shutdown the current cluster."""
        # TODO: Actually destroy. Halt is useful for debugging.
        self.provisioner.vagrant_cmd('halt')


    def execute_against_vm(self, vm_name, cmd):
        """Execute cmd against vm_name.

        @param vm_name: The name of the vm, eg: stumpyshard.
        @param cmd: The command to execute. It is wrapped in single quotes,
                so it must not contain single quotes itself.

        @return: The output of the command with trailing newlines stripped.
        """
        return self.provisioner.vagrant_cmd(
                "ssh %s -- '%s'" % (vm_name, cmd)).rstrip('\n')


    def _get_shadow_config_value(self, vm_name, key):
        """Read the value of a shadow_config option from a vm.

        Greps SHADOW_PATH on the vm for a line starting with 'key:'.

        @param vm_name: The name of the vm to read the shadow_config of.
        @param key: The name of the shadow_config option.

        @return: The text after the last ':' of the grep output, with
                leading spaces stripped.
        """
        cmd = 'grep "^%s:" %s' % (key, SHADOW_PATH)
        shadow_value = self.execute_against_vm(vm_name, cmd)
        return shadow_value.rsplit(':')[-1].lstrip(' ')


    def _check_shadow_config(self, vm, key, expected_value):
        """Sanity check a single shadow_config option of a vm.

        @param vm: The name of the vm to check.
        @param key: The name of the shadow_config option.
        @param expected_value: The value the option must have.

        @raises ConfigError: If the shadow_config is misconfigured.
        """
        value = self._get_shadow_config_value(vm, key)
        if value != expected_value:
            raise ConfigError(
                    '%s vm has misconfigued config %s = %s, expected %s' %
                    (vm, key, value, expected_value))
        logging.info('%s has %s = %s', vm, key, value)


    def _upstart_cmd(self, vm, job_name, cmd='status'):
        """Execute an upstart command.

        @param vm: The name of the vm to execute it against.
        @param job_name: The name of the upstart job.
        @param cmd: The upstart command.

        @return: The output of the upstart command. Any VagrantCmdError is
                reported as a 'service not found' message instead of being
                raised.
        """
        status_cmd = 'sudo %s %s' % (cmd, job_name)
        try:
            return self.execute_against_vm(vm, status_cmd)
        except vm_manager.VagrantCmdError:
            return '%s service not found on %s' % (job_name, vm)


    def check_services(self, action='start'):
        """Get the status of all core services on the vms.

        This method is designed to start services on the master/all
        shards if their shadow configs are as expected. If the shadow
        config option on a vm has an unexpected setting, services
        are not started on it.

        @param action: The action to perform on services. Start will
                start all of them, stop will stop them all.

        @raises ConfigError: If a shadow_config option is unexpected.
        """
        # A tuple keeps the order (and therefore the log) deterministic.
        core_services = (
                'scheduler', 'host-scheduler',
                'gs_offloader', 'gs_offloader_s', 'shard-client')
        # The default gateway as seen from the master; shards are expected
        # to use it as their global_db_host. execute_against_vm already
        # strips the trailing newline.
        gateway = self.execute_against_vm(
                self.vagrant_master_name,
                "netstat -rn | grep \"^0.0.0.0 \" | cut -d \" \" -f10 | head -1"
        )

        for vm in self.vagrant_shard_names + [self.vagrant_master_name]:
            vm_manager.format_msg('Checking services on %s' % vm)
            self._check_shadow_config(vm, 'host', 'localhost')
            global_db = ('localhost' if vm == self.vagrant_master_name
                         else gateway)
            self._check_shadow_config(vm, 'global_db_host', global_db)

            for service in core_services:
                logging.info('Checking %s on %s', service, vm)
                status = self._upstart_cmd(vm, service, action)
                logging.info(status)


def bringup_cluster(admin_repo, num_shards=NUM_SHARDS, start_safe=False):
    """Start a cluster.

    @param admin_repo: Path to the chromeos-admin repo.
    @param num_shards: Number of shards. You cannot change
        the number of shards on a running cluster, you need
        to destroy the cluster, remove the vagrant file,
        modify the ClusterTemplate to include a new section
        for the additional shard, and rerun clusterctl.
    @param start_safe: Start the cluster in safe mode. This means
        all core services will be stopped.

    @return: 0 if the cluster came up, 1 if it was shut down because a
        shadow_config was misconfigured.

    @raises ValueError: If admin_repo is missing or does not contain a
        puppet directory.
    """
    # Without this guard a missing --admin-repo crashes inside os.path.join.
    if not admin_repo:
        raise ValueError('Admin repo path is required to bring up a cluster')
    puppet_path = os.path.join(admin_repo, 'puppet')
    if not os.path.exists(puppet_path):
        raise ValueError('Admin repo %s does not contain puppet module' %
                         admin_repo)
    cluster_manager = ClusterManager(
            vm_provisioner=vm_manager.VirtualBox(puppet_path=puppet_path),
            vagrant_master_name='master', num_shards=num_shards)
    cluster_manager.start_cluster()
    try:
        cluster_manager.check_services(action='stop' if start_safe else 'start')
    except ConfigError as e:
        logging.error(
                'Shutting down cluster: %s', e)
        cluster_manager.shutdown_cluster()
        return 1
    return 0


def sync():
    """Sync autotest from the host to all vms in the cluster."""
    vm_manager.format_msg('Syncing Cluster')
    vm_manager.VagrantProvisioner.vagrant_cmd('rsync', stream_output=True)
    vm_manager.VagrantProvisioner.vagrant_cmd(
            'provision --provision-with shell', stream_output=True)
    vm_manager.format_msg('Please restart services as required')


def _parse_args(args):
    """Parse command line arguments.

    Exits with status 1 if no arguments are given.

    @param args: A list of command line arguments, eg sys.argv[1:]

    @return: The parsed args namespace, as returned by parser.parse_args.
        Its 'which' attribute names the chosen subcommand.
    """
    if not args:
        print ('Too few arguments, try clusterctl --help')
        sys.exit(1)

    description = ('A script to orchestrate a toy test lab. Provided '
                   'with a path to the internal repo it will download a '
                   'vm image and spin up a cluster against which you can '
                   'test core autotest changes without DUTs.')
    parser = argparse.ArgumentParser(description=description)
    subparsers = parser.add_subparsers()
    provision_subparser = subparsers.add_parser(
            'provision', help='provision a cluster')
    provision_subparser.required = False
    provision_subparser.set_defaults(which='provision')
    # Provisioning cannot proceed without the repo, so let argparse report
    # a missing path instead of failing later with a traceback.
    provision_subparser.add_argument(
            '--admin-repo', dest='admin_repo', type=str, required=True,
            help=('Path to the admin repo that has puppet scripts used for '
                  'provisioning the cluster. If you do not already have it you '
                  'can git clone the chromeos/chromeos-admin repo.'))
    provision_subparser.add_argument(
            '--safe', dest='start_safe', action='store_true',
            help='If sepcified services will not be started automatically.')

    # TODO: Automate restart of services via a --restart option.
    update_subparser = subparsers.add_parser('update', help='Update a cluster')
    update_subparser.required = False
    update_subparser.set_defaults(which='update')
    update_subparser.add_argument(
            '--sync', dest='sync', action='store_true',
            help='Sync autotest from host to all vms in cluster.')
    return parser.parse_args(args)


def main(args):
    """Main function.

    @param args: command line arguments for the script.

    @return: The exit status of the script: 0 on success, 1 if the cluster
        was misconfigured or there was nothing to do.
    """
    args = _parse_args(args)
    if args.which == 'update':
        # The update namespace has no admin_repo/start_safe, so it must
        # never fall through to bringup_cluster.
        if not args.sync:
            print ('Nothing to update, try clusterctl update --sync')
            return 1
        sync()
        return 0
    # Propagate the status so sys.exit reports a failed bringup.
    return bringup_cluster(
            admin_repo=args.admin_repo, start_safe=args.start_safe)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))