Home | History | Annotate | Download | only in toolchain-utils
      1 #!/usr/bin/env python2
      2 #
      3 # Copyright 2015 Google Inc.  All Rights Reserved.
      4 """This module controls locking and unlocking of test machines."""
      5 
      6 from __future__ import print_function
      7 
      8 import argparse
      9 import getpass
     10 import os
     11 import sys
     12 import traceback
     13 
     14 from cros_utils import logger
     15 from cros_utils import machines
     16 
     17 
     18 class AFELockException(Exception):
     19   """Base class for exceptions in this module."""
     20 
     21 
     22 class MachineNotPingable(AFELockException):
     23   """Raised when machine does not respond to ping."""
     24 
     25 
     26 class MissingHostInfo(AFELockException):
     27   """Raised when cannot find info about machine on machine servers."""
     28 
     29 
     30 class UpdateNonLocalMachine(AFELockException):
     31   """Raised when user requests to add/remove a ChromeOS HW Lab machine.."""
     32 
     33 
     34 class DuplicateAdd(AFELockException):
     35   """Raised when user requests to add a machine that's already on the server."""
     36 
     37 
     38 class UpdateServerError(AFELockException):
     39   """Raised when attempt to add/remove a machine from local server fails."""
     40 
     41 
     42 class LockingError(AFELockException):
     43   """Raised when server fails to lock/unlock machine as requested."""
     44 
     45 
     46 class DontOwnLock(AFELockException):
     47   """Raised when user attmepts to unlock machine locked by someone else."""
     48   # This should not be raised if the user specified '--force'
     49 
     50 
     51 class NoAFEServer(AFELockException):
     52   """Raised when cannot find/access the autotest server."""
     53 
     54 
     55 class AFEAccessError(AFELockException):
     56   """Raised when cannot get information about lab machine from lab server."""
     57 
     58 
     59 class AFELockManager(object):
     60   """Class for locking/unlocking machines vie Autotest Front End servers.
     61 
     62   This class contains methods for checking the locked status of machines
     63   on both the ChromeOS HW Lab AFE server and a local AFE server.  It also
     64   has methods for adding/removing machines from the local server, and for
     65   changing the lock status of machines on either server.  For the ChromeOS
     66   HW Lab, it only allows access to the toolchain team lab machines, as
     67   defined in toolchain-utils/crosperf/default_remotes.  By default it will
     68   look for a local server on chrotomation2.svl.corp.google.com, but an
     69   alternative local AFE server can be supplied, if desired.
     70 
     71   !!!IMPORTANT NOTE!!!  The AFE server can only be called from the main
     72   thread/process of a program.  If you launch threads and try to call it
     73   from a thread, you will get an error.  This has to do with restrictions
     74   in the Python virtual machine (and signal handling) and cannot be changed.
     75   """
     76 
     77   LOCAL_SERVER = 'chrotomation2.svl.corp.google.com'
     78 
     79   def __init__(self,
     80                remotes,
     81                force_option,
     82                chromeos_root,
     83                local_server,
     84                use_local=True,
     85                log=None):
     86     """Initializes an AFELockManager object.
     87 
     88     Args:
     89       remotes: A list of machine names or ip addresses to be managed.  Names
     90         and ip addresses should be represented as strings.  If the list is
     91         empty, the lock manager will get all known machines.
     92       force_option: A Boolean indicating whether or not to force an unlock of
     93         a machine that was locked by someone else.
     94       chromeos_root: The ChromeOS chroot to use for the autotest scripts.
     95       local_server: A string containing the name or ip address of the machine
     96         that is running an AFE server, which is to be used for managing
     97         machines that are not in the ChromeOS HW lab.
     98       local: A Boolean indicating whether or not to use/allow a local AFE
     99         server to be used (see local_server argument).
    100       use_local: Use the local server instead of the official one.
    101       log: If not None, this is the logger object to be used for writing out
    102         informational output messages.  It is expected to be an instance of
    103         Logger class from cros_utils/logger.py.
    104     """
    105     self.chromeos_root = chromeos_root
    106     self.user = getpass.getuser()
    107     self.logger = log or logger.GetLogger()
    108     autotest_path = os.path.join(chromeos_root,
    109                                  'src/third_party/autotest/files')
    110 
    111     sys.path.append(chromeos_root)
    112     sys.path.append(autotest_path)
    113     sys.path.append(os.path.join(autotest_path, 'server', 'cros'))
    114 
    115     # We have to wait to do these imports until the paths above have
    116     # been fixed.
    117     # pylint: disable=import-error
    118     from client import setup_modules
    119     setup_modules.setup(
    120         base_path=autotest_path, root_module_name='autotest_lib')
    121 
    122     from dynamic_suite import frontend_wrappers
    123 
    124     self.afe = frontend_wrappers.RetryingAFE(
    125         timeout_min=30, delay_sec=10, debug=False, server='cautotest')
    126 
    127     self.local = use_local
    128     self.machines = list(set(remotes)) or []
    129     self.toolchain_lab_machines = self.GetAllToolchainLabMachines()
    130     if self.machines and self.AllLabMachines():
    131       self.local = False
    132 
    133     if not self.local:
    134       self.local_afe = None
    135     else:
    136       dargs = {}
    137       dargs['server'] = local_server or AFELockManager.LOCAL_SERVER
    138       # Make sure local server is pingable.
    139       error_msg = ('Local autotest server machine %s not responding to ping.' %
    140                    dargs['server'])
    141       self.CheckMachine(dargs['server'], error_msg)
    142       self.local_afe = frontend_wrappers.RetryingAFE(
    143           timeout_min=30, delay_sec=10, debug=False, **dargs)
    144     if not self.machines:
    145       self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines()
    146     self.force = force_option
    147 
    148   def AllLabMachines(self):
    149     """Check to see if all machines being used are HW Lab machines."""
    150     all_lab = True
    151     for m in self.machines:
    152       if m not in self.toolchain_lab_machines:
    153         all_lab = False
    154         break
    155     return all_lab
    156 
    157   def CheckMachine(self, machine, error_msg):
    158     """Verifies that machine is responding to ping.
    159 
    160     Args:
    161       machine: String containing the name or ip address of machine to check.
    162       error_msg: Message to print if ping fails.
    163 
    164     Raises:
    165       MachineNotPingable:  If machine is not responding to 'ping'
    166     """
    167     if not machines.MachineIsPingable(machine, logging_level='none'):
    168       cros_machine = machine + '.cros'
    169       if not machines.MachineIsPingable(cros_machine, logging_level='none'):
    170         raise MachineNotPingable(error_msg)
    171 
    172   def MachineIsKnown(self, machine):
    173     """Checks to see if either AFE server knows the given machine.
    174 
    175     Args:
    176       machine: String containing name or ip address of machine to check.
    177 
    178     Returns:
    179       Boolean indicating if the machine is in the list of known machines for
    180         either AFE server.
    181     """
    182     if machine in self.toolchain_lab_machines:
    183       return True
    184     elif self.local_afe and machine in self.GetAllNonlabMachines():
    185       return True
    186 
    187     return False
    188 
    189   def GetAllToolchainLabMachines(self):
    190     """Gets a list of all the toolchain machines in the ChromeOS HW lab.
    191 
    192     Returns:
    193       A list of names of the toolchain machines in the ChromeOS HW lab.
    194     """
    195     machines_file = os.path.join(
    196         os.path.dirname(__file__), 'crosperf', 'default_remotes')
    197     machine_list = []
    198     with open(machines_file, 'r') as input_file:
    199       lines = input_file.readlines()
    200       for line in lines:
    201         _, remotes = line.split(':')
    202         remotes = remotes.strip()
    203         for r in remotes.split():
    204           machine_list.append(r.strip())
    205     return machine_list
    206 
    207   def GetAllNonlabMachines(self):
    208     """Gets a list of all known machines on the local AFE server.
    209 
    210     Returns:
    211       A list of the names of the machines on the local AFE server.
    212     """
    213     non_lab_machines = []
    214     if self.local_afe:
    215       non_lab_machines = self.local_afe.get_hostnames()
    216     return non_lab_machines
    217 
    218   def PrintStatusHeader(self, is_lab_machine):
    219     """Prints the status header lines for machines.
    220 
    221     Args:
    222       is_lab_machine: Boolean indicating whether to print HW Lab header or
    223         local machine header (different spacing).
    224     """
    225     if is_lab_machine:
    226       print('\nMachine (Board)\t\t\t\t\tStatus')
    227       print('---------------\t\t\t\t\t------\n')
    228     else:
    229       print('\nMachine (Board)\t\tStatus')
    230       print('---------------\t\t------\n')
    231 
    232   def RemoveLocalMachine(self, m):
    233     """Removes a machine from the local AFE server.
    234 
    235     Args:
    236       m: The machine to remove.
    237 
    238     Raises:
    239       MissingHostInfo:  Can't find machine to be removed.
    240     """
    241     if self.local_afe:
    242       host_info = self.local_afe.get_hosts(hostname=m)
    243       if host_info:
    244         host_info = host_info[0]
    245         host_info.delete()
    246       else:
    247         raise MissingHostInfo('Cannot find/delete machine %s.' % m)
    248 
    249   def AddLocalMachine(self, m):
    250     """Adds a machine to the local AFE server.
    251 
    252     Args:
    253       m: The machine to be added.
    254     """
    255     if self.local_afe:
    256       error_msg = 'Machine %s is not responding to ping.' % m
    257       self.CheckMachine(m, error_msg)
    258       self.local_afe.create_host(m)
    259 
    260   def AddMachinesToLocalServer(self):
    261     """Adds one or more machines to the local AFE server.
    262 
    263     Verify that the requested machines are legal to add to the local server,
    264     i.e. that they are not ChromeOS HW lab machines, and they are not already
    265     on the local server.  Call AddLocalMachine for each valid machine.
    266 
    267     Raises:
    268       DuplicateAdd: Attempt to add a machine that is already on the server.
    269       UpdateNonLocalMachine:  Attempt to add a ChromeOS HW lab machine.
    270       UpdateServerError:  Something went wrong while attempting to add a
    271         machine.
    272     """
    273     for m in self.machines:
    274       for cros_name in [m, m + '.cros']:
    275         if cros_name in self.toolchain_lab_machines:
    276           raise UpdateNonLocalMachine(
    277               'Machine %s is already in the ChromeOS HW'
    278               'Lab.  Cannot add it to local server.' % cros_name)
    279       host_info = self.local_afe.get_hosts(hostname=m)
    280       if host_info:
    281         raise DuplicateAdd('Machine %s is already on the local server.' % m)
    282       try:
    283         self.AddLocalMachine(m)
    284         self.logger.LogOutput('Successfully added %s to local server.' % m)
    285       except Exception as e:
    286         traceback.print_exc()
    287         raise UpdateServerError(
    288             'Error occurred while attempting to add %s. %s' % (m, str(e)))
    289 
    290   def RemoveMachinesFromLocalServer(self):
    291     """Removes one or more machines from the local AFE server.
    292 
    293     Verify that the requested machines are legal to remove from the local
    294     server, i.e. that they are not ChromeOS HW lab machines.  Call
    295     RemoveLocalMachine for each valid machine.
    296 
    297     Raises:
    298       UpdateServerError:  Something went wrong while attempting to remove a
    299         machine.
    300     """
    301     for m in self.machines:
    302       for cros_name in [m, m + '.cros']:
    303         if cros_name in self.toolchain_lab_machines:
    304           raise UpdateNonLocalMachine(
    305               'Machine %s is in the ChromeOS HW Lab. '
    306               'This script cannot remove lab machines.' % cros_name)
    307       try:
    308         self.RemoveLocalMachine(m)
    309         self.logger.LogOutput('Successfully removed %s from local server.' % m)
    310       except Exception as e:
    311         traceback.print_exc()
    312         raise UpdateServerError('Error occurred while attempting to remove %s '
    313                                 '(%s).' % (m, str(e)))
    314 
    315   def ListMachineStates(self, machine_states):
    316     """Gets and prints the current status for a list of machines.
    317 
    318     Prints out the current status for all of the machines in the current
    319     AFELockManager's list of machines (set when the object is initialized).
    320 
    321     Args:
    322       machine_states: A dictionary of the current state of every machine in
    323         the current AFELockManager's list of machines.  Normally obtained by
    324         calling AFELockManager::GetMachineStates.
    325     """
    326     local_machines = []
    327     printed_hdr = False
    328     for m in machine_states:
    329       cros_name = m + '.cros'
    330       if (m in self.toolchain_lab_machines or
    331           cros_name in self.toolchain_lab_machines):
    332         name = m if m in self.toolchain_lab_machines else cros_name
    333         if not printed_hdr:
    334           self.PrintStatusHeader(True)
    335           printed_hdr = True
    336         state = machine_states[m]
    337         if state['locked']:
    338           print('%s (%s)\tlocked by %s since %s' %
    339                 (name, state['board'], state['locked_by'], state['lock_time']))
    340         else:
    341           print('%s (%s)\tunlocked' % (name, state['board']))
    342       else:
    343         local_machines.append(m)
    344 
    345     if local_machines:
    346       self.PrintStatusHeader(False)
    347       for m in local_machines:
    348         state = machine_states[m]
    349         if state['locked']:
    350           print('%s (%s)\tlocked by %s since %s' %
    351                 (m, state['board'], state['locked_by'], state['lock_time']))
    352         else:
    353           print('%s (%s)\tunlocked' % (m, state['board']))
    354 
    355   def UpdateLockInAFE(self, should_lock_machine, machine):
    356     """Calls an AFE server to lock/unlock a machine.
    357 
    358     Args:
    359       should_lock_machine: Boolean indicating whether to lock the machine (True)
    360         or unlock the machine (False).
    361       machine: The machine to update.
    362 
    363     Raises:
    364       LockingError:  An error occurred while attempting to update the machine
    365         state.
    366     """
    367     action = 'lock'
    368     if not should_lock_machine:
    369       action = 'unlock'
    370     kwargs = {'locked': should_lock_machine}
    371     kwargs['lock_reason'] = 'toolchain user request (%s)' % self.user
    372 
    373     cros_name = machine + '.cros'
    374     if cros_name in self.toolchain_lab_machines:
    375       machine = cros_name
    376     if machine in self.toolchain_lab_machines:
    377       m = machine.split('.')[0]
    378       afe_server = self.afe
    379     else:
    380       m = machine
    381       afe_server = self.local_afe
    382 
    383     try:
    384       afe_server.run(
    385           'modify_hosts',
    386           host_filter_data={'hostname__in': [m]},
    387           update_data=kwargs)
    388     except Exception as e:
    389       traceback.print_exc()
    390       raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e)))
    391 
    392   def UpdateMachines(self, lock_machines):
    393     """Sets the locked state of the machines to the requested value.
    394 
    395     The machines updated are the ones in self.machines (specified when the
    396     class object was intialized).
    397 
    398     Args:
    399       lock_machines: Boolean indicating whether to lock the machines (True) or
    400         unlock the machines (False).
    401 
    402     Returns:
    403       A list of the machines whose state was successfully updated.
    404     """
    405     updated_machines = []
    406     for m in self.machines:
    407       self.UpdateLockInAFE(lock_machines, m)
    408       # Since we returned from self.UpdateLockInAFE we assume the request
    409       # succeeded.
    410       if lock_machines:
    411         self.logger.LogOutput('Locked machine(s) %s.' % m)
    412       else:
    413         self.logger.LogOutput('Unlocked machine(s) %s.' % m)
    414       updated_machines.append(m)
    415 
    416     return updated_machines
    417 
    418   def _InternalRemoveMachine(self, machine):
    419     """Remove machine from internal list of machines.
    420 
    421     Args:
    422       machine: Name of machine to be removed from internal list.
    423     """
    424     # Check to see if machine is lab machine and if so, make sure it has
    425     # ".cros" on the end.
    426     cros_machine = machine
    427     if machine.find('rack') > 0 and machine.find('row') > 0:
    428       if machine.find('.cros') == -1:
    429         cros_machine = cros_machine + '.cros'
    430 
    431     self.machines = [
    432         m for m in self.machines if m != cros_machine and m != machine
    433     ]
    434 
    435   def CheckMachineLocks(self, machine_states, cmd):
    436     """Check that every machine in requested list is in the proper state.
    437 
    438     If the cmd is 'unlock' verify that every machine is locked by requestor.
    439     If the cmd is 'lock' verify that every machine is currently unlocked.
    440 
    441     Args:
    442       machine_states: A dictionary of the current state of every machine in
    443         the current AFELockManager's list of machines.  Normally obtained by
    444         calling AFELockManager::GetMachineStates.
    445       cmd: The user-requested action for the machines: 'lock' or 'unlock'.
    446 
    447     Raises:
    448       DontOwnLock: The lock on a requested machine is owned by someone else.
    449     """
    450     for k, state in machine_states.iteritems():
    451       if cmd == 'unlock':
    452         if not state['locked']:
    453           self.logger.LogWarning('Attempt to unlock already unlocked machine '
    454                                  '(%s).' % k)
    455           self._InternalRemoveMachine(k)
    456 
    457         if state['locked'] and state['locked_by'] != self.user:
    458           raise DontOwnLock('Attempt to unlock machine (%s) locked by someone '
    459                             'else (%s).' % (k, state['locked_by']))
    460       elif cmd == 'lock':
    461         if state['locked']:
    462           self.logger.LogWarning(
    463               'Attempt to lock already locked machine (%s)' % k)
    464           self._InternalRemoveMachine(k)
    465 
    466   def HasAFEServer(self, local):
    467     """Verifies that the AFELockManager has appropriate AFE server.
    468 
    469     Args:
    470       local: Boolean indicating whether we are checking for the local server
    471         (True) or for the global server (False).
    472 
    473     Returns:
    474       A boolean indicating if the AFELockManager has the requested AFE server.
    475     """
    476     if local:
    477       return self.local_afe is not None
    478     else:
    479       return self.afe is not None
    480 
    481   def GetMachineStates(self, cmd=''):
    482     """Gets the current state of all the requested machines.
    483 
    484     Gets the current state of all the requested machines, both from the HW lab
    485     sever and from the local server.  Stores the data in a dictionary keyed
    486     by machine name.
    487 
    488     Args:
    489       cmd: The command for which we are getting the machine states. This is
    490         important because if one of the requested machines is missing we raise
    491         an exception, unless the requested command is 'add'.
    492 
    493     Returns:
    494       A dictionary of machine states for all the machines in the AFELockManager
    495       object.
    496 
    497     Raises:
    498       NoAFEServer:  Cannot find the HW Lab or local AFE server.
    499       AFEAccessError:  An error occurred when querying the server about a
    500         machine.
    501     """
    502     if not self.HasAFEServer(False):
    503       raise NoAFEServer('Error: Cannot connect to main AFE server.')
    504 
    505     if self.local and not self.HasAFEServer(True):
    506       raise NoAFEServer('Error: Cannot connect to local AFE server.')
    507 
    508     machine_list = {}
    509     for m in self.machines:
    510       host_info = None
    511       cros_name = m + '.cros'
    512       if (m in self.toolchain_lab_machines or
    513           cros_name in self.toolchain_lab_machines):
    514         mod_host = m.split('.')[0]
    515         host_info = self.afe.get_hosts(hostname=mod_host)
    516         if not host_info:
    517           raise AFEAccessError('Unable to get information about %s from main'
    518                                ' autotest server.' % m)
    519       else:
    520         host_info = self.local_afe.get_hosts(hostname=m)
    521         if not host_info and cmd != 'add':
    522           raise AFEAccessError('Unable to get information about %s from '
    523                                'local autotest server.' % m)
    524       if host_info:
    525         host_info = host_info[0]
    526         name = host_info.hostname
    527         values = {}
    528         values['board'] = host_info.platform if host_info.platform else '??'
    529         values['locked'] = host_info.locked
    530         if host_info.locked:
    531           values['locked_by'] = host_info.locked_by
    532           values['lock_time'] = host_info.lock_time
    533         else:
    534           values['locked_by'] = ''
    535           values['lock_time'] = ''
    536         machine_list[name] = values
    537       else:
    538         machine_list[m] = {}
    539     return machine_list
    540 
    541 
    542 def Main(argv):
    543   """Parse the options, initialize lock manager and dispatch proper method.
    544 
    545   Args:
    546     argv: The options with which this script was invoked.
    547 
    548   Returns:
    549     0 unless an exception is raised.
    550   """
    551   parser = argparse.ArgumentParser()
    552 
    553   parser.add_argument(
    554       '--list',
    555       dest='cmd',
    556       action='store_const',
    557       const='status',
    558       help='List current status of all known machines.')
    559   parser.add_argument(
    560       '--lock',
    561       dest='cmd',
    562       action='store_const',
    563       const='lock',
    564       help='Lock given machine(s).')
    565   parser.add_argument(
    566       '--unlock',
    567       dest='cmd',
    568       action='store_const',
    569       const='unlock',
    570       help='Unlock given machine(s).')
    571   parser.add_argument(
    572       '--status',
    573       dest='cmd',
    574       action='store_const',
    575       const='status',
    576       help='List current status of given machine(s).')
    577   parser.add_argument(
    578       '--add_machine',
    579       dest='cmd',
    580       action='store_const',
    581       const='add',
    582       help='Add machine to local machine server.')
    583   parser.add_argument(
    584       '--remove_machine',
    585       dest='cmd',
    586       action='store_const',
    587       const='remove',
    588       help='Remove machine from the local machine server.')
    589   parser.add_argument(
    590       '--nolocal',
    591       dest='local',
    592       action='store_false',
    593       default=True,
    594       help='Do not try to use local machine server.')
    595   parser.add_argument(
    596       '--remote', dest='remote', help='machines on which to operate')
    597   parser.add_argument(
    598       '--chromeos_root',
    599       dest='chromeos_root',
    600       required=True,
    601       help='ChromeOS root to use for autotest scripts.')
    602   parser.add_argument(
    603       '--local_server',
    604       dest='local_server',
    605       default=None,
    606       help='Alternate local autotest server to use.')
    607   parser.add_argument(
    608       '--force',
    609       dest='force',
    610       action='store_true',
    611       default=False,
    612       help='Force lock/unlock of machines, even if not'
    613       ' current lock owner.')
    614 
    615   options = parser.parse_args(argv)
    616 
    617   if not options.remote and options.cmd != 'status':
    618     parser.error('No machines specified for operation.')
    619 
    620   if not os.path.isdir(options.chromeos_root):
    621     parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root)
    622 
    623   if not options.cmd:
    624     parser.error('No operation selected (--list, --status, --lock, --unlock,'
    625                  ' --add_machine, --remove_machine).')
    626 
    627   machine_list = []
    628   if options.remote:
    629     machine_list = options.remote.split()
    630 
    631   lock_manager = AFELockManager(machine_list, options.force,
    632                                 options.chromeos_root, options.local_server,
    633                                 options.local)
    634 
    635   machine_states = lock_manager.GetMachineStates(cmd=options.cmd)
    636   cmd = options.cmd
    637 
    638   if cmd == 'status':
    639     lock_manager.ListMachineStates(machine_states)
    640 
    641   elif cmd == 'lock':
    642     if not lock_manager.force:
    643       lock_manager.CheckMachineLocks(machine_states, cmd)
    644       lock_manager.UpdateMachines(True)
    645 
    646   elif cmd == 'unlock':
    647     if not lock_manager.force:
    648       lock_manager.CheckMachineLocks(machine_states, cmd)
    649       lock_manager.UpdateMachines(False)
    650 
    651   elif cmd == 'add':
    652     lock_manager.AddMachinesToLocalServer()
    653 
    654   elif cmd == 'remove':
    655     lock_manager.RemoveMachinesFromLocalServer()
    656 
    657   return 0
    658 
    659 
    660 if __name__ == '__main__':
    661   sys.exit(Main(sys.argv[1:]))
    662