#!/usr/bin/python
# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Orchestrate virtual machines to set up a toy instance of the lab for testing.

This module is meant to help create a closed-loop development flow for members
of the lab team, which looks something like this:
                    ______________
                   |              |
                   |gs vm registry|<+
                   |______________| |
                          |         |
                          v         |
        New change -> puppylab -> New core_cluster box
                          |
         Vagrantfile specifies cluster settings
         _________________|____________________
        |                                      |
        |  puppet provisions core_cluster box  |
        |______________________________________|
                |          | ........... |
                v          v             v
              master     shard1       shardn
             |     |     |     |      |     |
            mysql  afe  tko heartbt   tko heartbt
             |     |     |     |      |     |
host ports  8001  8002  8001  8002    8001  8002
        [host ports liable to autocorrect as needed]

This module can work with any vm hosting service/provider as long as it
adheres to the vagrant interface. VirtualBox is the only implementation so
far, though GCE would be an ideal candidate.

Class spec:
* VagrantProvisioner: Provision boxes per a VagrantFile.
    * VirtualBoxProvisioner: Generate a VirtualBox VagrantFile.
* CoreVM: Manage individual core_cluster vms.
* ClusterManager: Spin up a cluster.

Usage: clusterctl provision --admin-repo /usr/local/autotest/chromeos-internal
"""

import argparse
import logging
import os
import sys

import common
from autotest_lib.puppylab import lab_manifest
from autotest_lib.puppylab import vm_manager
from autotest_lib.site_utils.lib import infra


# TODO: Enable multiple shards via command line args.
NUM_SHARDS = 1
SHADOW_PATH = '/usr/local/autotest/shadow_config.ini'


class ConfigError(Exception):
    """Raised if one of the vms in the cluster is misconfigured."""


class CoreVM(object):
    """Interface to create and manage a core_cluster vm image.

    A core_cluster vm image has base packages shared by all server roles.
    """
    _core_vm_name = 'chromeos_lab_core_cluster'
    _core_image_source = 'gs://vms/%s.box' % _core_vm_name
    _core_image_name = '%s.box' % _core_vm_name
    _core_image_destination = os.path.join(
            vm_manager.VAGRANT_DIR, _core_image_name)

    # TODO: Preparation is currently by hand. Use the provisioner to
    # create a box of name '_core_image_name', with the CoreClusterTemplate
    # in the VAGRANT_DIR if you wish to prepare a new vm. You can achieve
    # this by:
    # * Copying the CoreClusterTemplate to a Vagrantfile and replacing the
    #   modulepath with the path to your chromeos-admin/puppet directory.
    # * Calling `vagrant up` in the directory with this vagrant file.
    # * When it's done, calling `vagrant package`.
    # This should produce a package.box in the same dir.
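    # An illustrative command sequence for the steps above (the paths and the
    # final box name are placeholders; adjust them to your checkout):
    #   cd <VAGRANT_DIR>
    #   cp CoreClusterTemplate Vagrantfile
    #   # edit Vagrantfile: point modulepath at <checkout>/chromeos-admin/puppet
    #   vagrant up
    #   vagrant package    # produces package.box in the same dir
    #   mv package.box chromeos_lab_core_cluster.box   # i.e. _core_image_name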

    def __init__(self, provisioner):
        self.provisioner = provisioner


    def setup_core_box(self):
        """Set up a core cluster vm.

        Download a core_cluster image if one isn't present on disk and
        register it with vagrant.
        """
        if not os.path.exists(self._core_image_destination):
            infra.execute_command(
                'localhost', 'gsutil cp %s %s' %
                (self._core_image_source, self._core_image_destination))
        self.provisioner.register_box(
                self._core_image_destination, self._core_vm_name)


    def teardown_core_box(self):
        """Tear down a core cluster vm."""
        # TODO: delete the box file.
        self.provisioner.unregister_box(self._core_vm_name)


class ClusterManager(object):
    """Interface to spin up a cluster of CoreVMs.

    This class manages all the details between creating a core_cluster image
    and running tests on a full-fledged cluster.
    """

    def _register_shards(self, num_shards):
        """Register num_shards worth of shard info.

        This includes the name, port and board of each new shard. This
        information is piped through to each vm, so the cluster manager is
        actually in control of all the shards in the cluster and can address
        them by name.

        Consider a shard, shard1, assigned to board stumpy:
            * You will be able to ssh into it with 'vagrant ssh stumpyshard'.
            * The afe for the shard will be running on an incrementally
              designated port starting from shards_base_port.
            * The afe port of the shard is piped through to the shadow_config.
              This is required for 2 reasons:
                # `cli/atest shard add` should use this name, because it is
                  the name the shard-client will use to request jobs.
                # the master afe should show links to the shard using this name.

        @param num_shards: The number of shards we wish to add to the cluster.
        """
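        # Illustrative outcome for num_shards=1, assuming lab_manifest.shards
        # contains a 'board:stumpy' entry (the port and hostname values below
        # are made up; in practice they come from lab_manifest):
        #   vagrantfile_shard_args = {
        #       'shard1': 'stumpyshard',
        #       'shard1_shadow_config_hostname': '<vm_host_name>:<base_port+1>',
        #       'shard1_port': <base_port+1>,
        #   }
        #   shard_board_map = {'<vm_host_name>:<base_port+1>': 'board:stumpy'}
        #   vagrant_shard_names = ['stumpyshard']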
        self.vagrantfile_shard_args = {}
        self.shard_board_map = {}
        self.vagrant_shard_names = []

        for num in range(1, num_shards+1):
            # The name to use for vagrant ssh
            shard_name = 'shard%s' % num
            # The port for the shard's afe
            shard_port = lab_manifest.shards_base_port + num
            # The hostname to use in the shadow_config of the shard
            shard_hostname = '%s:%s' % (lab_manifest.vm_host_name, shard_port)

            self.vagrantfile_shard_args.update({
                shard_name: shard_name,
                '%s_shadow_config_hostname' % shard_name: shard_hostname,
                '%s_port' % shard_name: shard_port,
            })
            if lab_manifest.shards:
                board = lab_manifest.shards.pop()
                # Assign a board to a shard. Use the shard_hostname as this
                # setting is not meant to be human readable.
                self.shard_board_map[shard_hostname] = board
                vagrant_shard_name = '%sshard' % board.rsplit(':')[-1]
                # Replace the shard<int>-type name with board_shard.
                self.vagrantfile_shard_args[shard_name] = vagrant_shard_name
                self.vagrant_shard_names.append(vagrant_shard_name)


    def __init__(self, vm_provisioner, vagrant_master_name='master',
                 num_shards=1):
        """Initialize parameters for the cluster.

        @param vm_provisioner: A provisioner object, currently the only one
            supported is VirtualBox.
        @param vagrant_master_name: The name to give the cluster master.
        @param num_shards: The number of shards in the cluster. Each shard
            gets a name allocated based on its number (eg: shard1).
        """
        self.provisioner = vm_provisioner
        self.vm_manager = CoreVM(provisioner=self.provisioner)
        self._register_shards(num_shards)
        self.vagrant_master_name = vagrant_master_name


    def start_cluster(self):
        """Start a cluster."""
        self.vm_manager.setup_core_box()

        # TODO: Add a --rebuild-cluster option.
        needs_destroy = self.provisioner.initialize_vagrant(
                master=self.vagrant_master_name,
                master_port=lab_manifest.master_afe_port,
                **self.vagrantfile_shard_args)
        self.provisioner.provision(needs_destroy)


    def shutdown_cluster(self):
        """Shut down the current cluster."""
        # TODO: Actually destroy. Halt is useful for debugging.
        self.provisioner.vagrant_cmd('halt')


    def execute_against_vm(self, vm_name, cmd):
        """Execute cmd against vm_name.

        @param vm_name: The name of the vm, eg: stumpyshard.
        @param cmd: The command to execute.
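
        Example (illustrative):
            self.execute_against_vm('stumpyshard', 'uptime')
        runs `uptime` in the stumpyshard vm over `vagrant ssh` and returns
        the output with the trailing newline stripped.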
        """
        return self.provisioner.vagrant_cmd(
                "ssh %s -- '%s'" % (vm_name, cmd)).rstrip('\n')


    def _get_shadow_config_value(self, vm_name, key):
        """Return the value of the shadow_config setting `key` on `vm_name`."""
        cmd = 'grep "^%s:" %s' % (key, SHADOW_PATH)
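        # e.g. for key='host' this greps for '^host:' in the shadow_config;
        # a matching line like 'host: localhost' reduces to 'localhost' below.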
        shadow_value = self.execute_against_vm(vm_name, cmd)
        return shadow_value.rsplit(':')[-1].lstrip(' ')


    def _check_shadow_config(self, vm, key, expected_value):
        """Check that a shadow_config setting on a vm has the expected value.

        @param vm: The name of the vm to check.
        @param key: The shadow_config key to check.
        @param expected_value: The value the key is expected to have.

        @raises ConfigError: If the shadow_config is misconfigured.
        """
        value = self._get_shadow_config_value(vm, key)
        if value != expected_value:
            raise ConfigError(
                    '%s vm has misconfigured config %s = %s, expected %s' %
                    (vm, key, value, expected_value))
        logging.info('%s has %s = %s', vm, key, value)


    def _upstart_cmd(self, vm, job_name, cmd='status'):
        """Execute an upstart command.

        @param vm: The name of the vm to execute it against.
        @param job_name: The name of the upstart job.
        @param cmd: The upstart command.

        @return: The output of the upstart command.
        """
        status_cmd = 'sudo %s %s' % (cmd, job_name)
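        # e.g. cmd='start', job_name='scheduler' yields 'sudo start scheduler'.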
        try:
            return self.execute_against_vm(vm, status_cmd)
        except vm_manager.VagrantCmdError:
            return '%s service not found on %s' % (job_name, vm)


    def check_services(self, action='start'):
        """Get the status of all core services on the vms.

        This method is designed to start services on the master/all
        shards if their shadow configs are as expected. If the shadow
        config option on a vm has an unexpected setting, services
        are not started on it.

        @param action: The action to perform on services. 'start' will
            start all of them, 'stop' will stop them all.

        @raises ConfigError: If a shadow_config option is unexpected.
        """
        core_services = set(
                ['scheduler', 'host-scheduler',
                 'gs_offloader', 'gs_offloader_s', 'shard-client'])
        gateway = self.execute_against_vm(
                self.vagrant_master_name,
                "netstat -rn | grep \"^0.0.0.0 \" | cut -d \" \" -f10 | head -1"
                ).rstrip('\n')
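        # With the default VirtualBox NAT networking, the master's default
        # route typically points at the host-side gateway (often 10.0.2.2);
        # that is the address the shards are expected to use as global_db_host
        # to reach the master's database. The exact value depends on the
        # provider's network setup.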

        for vm in self.vagrant_shard_names + [self.vagrant_master_name]:
            vm_manager.format_msg('Checking services on %s' % vm)
            self._check_shadow_config(vm, 'host', 'localhost')
            global_db = ('localhost' if vm == self.vagrant_master_name
                         else gateway)
            self._check_shadow_config(vm, 'global_db_host', global_db)

            for service in core_services:
                logging.info('Checking %s on %s', service, vm)
                status = self._upstart_cmd(vm, service, action)
                logging.info(status)


def bringup_cluster(admin_repo, num_shards=NUM_SHARDS, start_safe=False):
    """Start a cluster.

    @param admin_repo: Path to the chromeos-admin repo.
    @param num_shards: Number of shards. You cannot change
        the number of shards on a running cluster; you need
        to destroy the cluster, remove the vagrant file,
        modify the ClusterTemplate to include a new section
        for the additional shard, and rerun clusterctl.
    @param start_safe: Start the cluster in safe mode. This means
        all core services will be stopped.
    """
    puppet_path = os.path.join(admin_repo, 'puppet')
    if not os.path.exists(puppet_path):
        raise ValueError('Admin repo %s does not contain puppet module' %
                         admin_repo)
    cluster_manager = ClusterManager(
            vm_provisioner=vm_manager.VirtualBox(puppet_path=puppet_path),
            vagrant_master_name='master', num_shards=num_shards)
    cluster_manager.start_cluster()
    try:
        cluster_manager.check_services(action='stop' if start_safe else 'start')
    except ConfigError as e:
        logging.error(
                'Shutting down cluster: %s', e)
        cluster_manager.shutdown_cluster()
        return 1


def sync():
    """Sync autotest from the host to all vms in the cluster."""
    vm_manager.format_msg('Syncing Cluster')
    vm_manager.VagrantProvisioner.vagrant_cmd('rsync', stream_output=True)
    vm_manager.VagrantProvisioner.vagrant_cmd(
            'provision --provision-with shell', stream_output=True)
    vm_manager.format_msg('Please restart services as required')


def _parse_args(args):
    """Parse command line arguments.

    @param args: A list of command line arguments, eg sys.argv[1:]

    @return: The parsed args, as returned by parser.parse_args.
    """
    if not args:
        print('Too few arguments, try clusterctl --help')
        sys.exit(1)

    description = ('A script to orchestrate a toy test lab. Provided '
                   'with a path to the internal repo it will download a '
                   'vm image and spin up a cluster against which you can '
                   'test core autotest changes without DUTs.')
    parser = argparse.ArgumentParser(description=description)
    subparsers = parser.add_subparsers()
    provision_subparser = subparsers.add_parser(
            'provision', help='provision a cluster')
    provision_subparser.required = False
    provision_subparser.set_defaults(which='provision')
    provision_subparser.add_argument(
            '--admin-repo', dest='admin_repo', type=str,
            help=('Path to the admin repo that has puppet scripts used for '
                  'provisioning the cluster. If you do not already have it you '
                  'can git clone the chromeos/chromeos-admin repo.'))
    provision_subparser.add_argument(
            '--safe', dest='start_safe', action='store_true',
            help='If specified, services will not be started automatically.')

    # TODO: Automate restart of services via a --restart option.
    update_subparser = subparsers.add_parser('update', help='Update a cluster')
    update_subparser.required = False
    update_subparser.set_defaults(which='update')
    update_subparser.add_argument(
            '--sync', dest='sync', action='store_true',
            help='Sync autotest from host to all vms in cluster.')
    return parser.parse_args(args)
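
# Illustrative invocations (assuming the script is installed as `clusterctl`,
# as in the module docstring; repo paths are placeholders):
#   clusterctl provision --admin-repo ~/chromeos-admin
#   clusterctl provision --admin-repo ~/chromeos-admin --safe
#   clusterctl update --sync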


def main(args):
    """Main function.

    @param args: command line arguments for the script.
    """
    args = _parse_args(args)
    if args.which == 'update' and args.sync:
        sync()
    else:
        return bringup_cluster(
                admin_repo=args.admin_repo, start_safe=args.start_safe)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))