Home | History | Annotate | Download | only in toolchain-utils
      1 #!/usr/bin/python2
      2 #
# Copyright 2015 Google Inc.  All Rights Reserved.
      4 """This module controls locking and unlocking of test machines."""
      5 
      6 from __future__ import print_function
      7 
      8 import argparse
      9 import getpass
     10 import os
     11 import sys
     12 import traceback
     13 
     14 from cros_utils import logger
     15 from cros_utils import machines
     16 
     17 
class AFELockException(Exception):
  """Base class for all exceptions raised by this module."""
     20 
     21 
class MachineNotPingable(AFELockException):
  """Raised when a machine does not respond to ping."""
     24 
     25 
class MissingHostInfo(AFELockException):
  """Raised when info about a machine cannot be found on the machine servers."""
     28 
     29 
class UpdateNonLocalMachine(AFELockException):
  """Raised when user requests to add/remove a ChromeOS HW Lab machine."""
     32 
     33 
class DuplicateAdd(AFELockException):
  """Raised when user requests to add a machine already on the local server."""
     36 
     37 
class UpdateServerError(AFELockException):
  """Raised when an attempt to add/remove a local-server machine fails."""
     40 
     41 
class LockingError(AFELockException):
  """Raised when the server fails to lock/unlock a machine as requested."""
     44 
     45 
class DontOwnLock(AFELockException):
  """Raised when user attempts to unlock a machine locked by someone else."""
  # This should not be raised if the user specified '--force'.
     49 
     50 
class NoAFEServer(AFELockException):
  """Raised when the autotest server cannot be found/accessed."""
     53 
     54 
class AFEAccessError(AFELockException):
  """Raised when info about a machine cannot be obtained from an AFE server."""
     57 
     58 
     59 class AFELockManager(object):
  """Class for locking/unlocking machines via Autotest Front End servers.
     61 
     62   This class contains methods for checking the locked status of machines
     63   on both the ChromeOS HW Lab AFE server and a local AFE server.  It also
     64   has methods for adding/removing machines from the local server, and for
     65   changing the lock status of machines on either server.  For the ChromeOS
     66   HW Lab, it only allows access to the toolchain team lab machines, as
     67   defined in toolchain-utils/crosperf/default_remotes.  By default it will
     68   look for a local server on chrotomation2.mtv.corp.google.com, but an
     69   alternative local AFE server can be supplied, if desired.
     70 
     71   !!!IMPORTANT NOTE!!!  The AFE server can only be called from the main
     72   thread/process of a program.  If you launch threads and try to call it
     73   from a thread, you will get an error.  This has to do with restrictions
     74   in the Python virtual machine (and signal handling) and cannot be changed.
     75   """
     76 
     77   LOCAL_SERVER = 'chrotomation2.mtv.corp.google.com'
     78 
  def __init__(self,
               remotes,
               force_option,
               chromeos_root,
               local_server,
               use_local=True,
               log=None):
    """Initializes an AFELockManager object.

    Extends sys.path so the autotest modules under chromeos_root can be
    imported, connects to the HW Lab AFE server and, when a local server is
    to be used, verifies that it answers ping and connects to it as well.

    Args:
      remotes: A list of machine names or ip addresses to be managed.  Names
        and ip addresses should be represented as strings.  If the list is
        empty, the lock manager will get all known machines.
      force_option: A Boolean indicating whether or not to force an unlock of
        a machine that was locked by someone else.
      chromeos_root: The ChromeOS chroot to use for the autotest scripts.
      local_server: A string containing the name or ip address of the machine
        that is running an AFE server, which is to be used for managing
        machines that are not in the ChromeOS HW lab.  If empty/None,
        AFELockManager.LOCAL_SERVER is used.
      use_local: A Boolean indicating whether or not to use/allow a local AFE
        server to be used (see local_server argument).  It is overridden to
        False when every machine in remotes is a toolchain lab machine.
      log: If not None, this is the logger object to be used for writing out
        informational output messages.  It is expected to be an instance of
        Logger class from cros_utils/logger.py.

    Raises:
      MachineNotPingable: A local AFE server is to be used but it does not
        respond to ping.
    """
    self.chromeos_root = chromeos_root
    self.user = getpass.getuser()
    self.logger = log or logger.GetLogger()
    autotest_path = os.path.join(chromeos_root,
                                 'src/third_party/autotest/files')

    sys.path.append(chromeos_root)
    sys.path.append(autotest_path)
    sys.path.append(os.path.join(autotest_path, 'server', 'cros'))

    # We have to wait to do these imports until the paths above have
    # been fixed.
    # pylint: disable=import-error
    from client import setup_modules
    setup_modules.setup(
        base_path=autotest_path, root_module_name='autotest_lib')

    from dynamic_suite import frontend_wrappers

    # Connection to the ChromeOS HW Lab AFE server.
    self.afe = frontend_wrappers.RetryingAFE(
        timeout_min=30, delay_sec=10, debug=False, server='cautotest')

    self.local = use_local
    # Drop duplicate machine names; set() does not preserve the given order.
    self.machines = list(set(remotes)) or []
    self.toolchain_lab_machines = self.GetAllToolchainLabMachines()
    # When only lab machines were requested the local server is not needed.
    if self.machines and self.AllLabMachines():
      self.local = False

    if not self.local:
      self.local_afe = None
    else:
      dargs = {}
      dargs['server'] = local_server or AFELockManager.LOCAL_SERVER
      # Make sure local server is pingable.
      error_msg = ('Local autotest server machine %s not responding to ping.' %
                   dargs['server'])
      self.CheckMachine(dargs['server'], error_msg)
      self.local_afe = frontend_wrappers.RetryingAFE(
          timeout_min=30, delay_sec=10, debug=False, **dargs)
    # No machines were requested: manage every known lab and local machine.
    if not self.machines:
      self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines()
    self.force = force_option
    146 
    147   def AllLabMachines(self):
    148     """Check to see if all machines being used are HW Lab machines."""
    149     all_lab = True
    150     for m in self.machines:
    151       if m not in self.toolchain_lab_machines:
    152         all_lab = False
    153         break
    154     return all_lab
    155 
    156   def CheckMachine(self, machine, error_msg):
    157     """Verifies that machine is responding to ping.
    158 
    159     Args:
    160       machine: String containing the name or ip address of machine to check.
    161       error_msg: Message to print if ping fails.
    162 
    163     Raises:
    164       MachineNotPingable:  If machine is not responding to 'ping'
    165     """
    166     if not machines.MachineIsPingable(machine, logging_level='none'):
    167       cros_machine = machine + '.cros'
    168       if not machines.MachineIsPingable(cros_machine, logging_level='none'):
    169         raise MachineNotPingable(error_msg)
    170 
    171   def MachineIsKnown(self, machine):
    172     """Checks to see if either AFE server knows the given machine.
    173 
    174     Args:
    175       machine: String containing name or ip address of machine to check.
    176 
    177     Returns:
    178       Boolean indicating if the machine is in the list of known machines for
    179         either AFE server.
    180     """
    181     if machine in self.toolchain_lab_machines:
    182       return True
    183     elif self.local_afe and machine in self.GetAllNonlabMachines():
    184       return True
    185 
    186     return False
    187 
    188   def GetAllToolchainLabMachines(self):
    189     """Gets a list of all the toolchain machines in the ChromeOS HW lab.
    190 
    191     Returns:
    192       A list of names of the toolchain machines in the ChromeOS HW lab.
    193     """
    194     machines_file = os.path.join(
    195         os.path.dirname(__file__), 'crosperf', 'default_remotes')
    196     machine_list = []
    197     with open(machines_file, 'r') as input_file:
    198       lines = input_file.readlines()
    199       for line in lines:
    200         _, remotes = line.split(':')
    201         remotes = remotes.strip()
    202         for r in remotes.split():
    203           machine_list.append(r.strip())
    204     return machine_list
    205 
    206   def GetAllNonlabMachines(self):
    207     """Gets a list of all known machines on the local AFE server.
    208 
    209     Returns:
    210       A list of the names of the machines on the local AFE server.
    211     """
    212     non_lab_machines = []
    213     if self.local_afe:
    214       non_lab_machines = self.local_afe.get_hostnames()
    215     return non_lab_machines
    216 
    217   def PrintStatusHeader(self, is_lab_machine):
    218     """Prints the status header lines for machines.
    219 
    220     Args:
    221       is_lab_machine: Boolean indicating whether to print HW Lab header or
    222         local machine header (different spacing).
    223     """
    224     if is_lab_machine:
    225       print('\nMachine (Board)\t\t\t\t\tStatus')
    226       print('---------------\t\t\t\t\t------\n')
    227     else:
    228       print('\nMachine (Board)\t\tStatus')
    229       print('---------------\t\t------\n')
    230 
    231   def RemoveLocalMachine(self, m):
    232     """Removes a machine from the local AFE server.
    233 
    234     Args:
    235       m: The machine to remove.
    236 
    237     Raises:
    238       MissingHostInfo:  Can't find machine to be removed.
    239     """
    240     if self.local_afe:
    241       host_info = self.local_afe.get_hosts(hostname=m)
    242       if host_info:
    243         host_info = host_info[0]
    244         host_info.delete()
    245       else:
    246         raise MissingHostInfo('Cannot find/delete machine %s.' % m)
    247 
    248   def AddLocalMachine(self, m):
    249     """Adds a machine to the local AFE server.
    250 
    251     Args:
    252       m: The machine to be added.
    253     """
    254     if self.local_afe:
    255       error_msg = 'Machine %s is not responding to ping.' % m
    256       self.CheckMachine(m, error_msg)
    257       self.local_afe.create_host(m)
    258 
    259   def AddMachinesToLocalServer(self):
    260     """Adds one or more machines to the local AFE server.
    261 
    262     Verify that the requested machines are legal to add to the local server,
    263     i.e. that they are not ChromeOS HW lab machines, and they are not already
    264     on the local server.  Call AddLocalMachine for each valid machine.
    265 
    266     Raises:
    267       DuplicateAdd: Attempt to add a machine that is already on the server.
    268       UpdateNonLocalMachine:  Attempt to add a ChromeOS HW lab machine.
    269       UpdateServerError:  Something went wrong while attempting to add a
    270         machine.
    271     """
    272     for m in self.machines:
    273       for cros_name in [m, m + '.cros']:
    274         if cros_name in self.toolchain_lab_machines:
    275           raise UpdateNonLocalMachine('Machine %s is already in the ChromeOS HW'
    276                                       'Lab.  Cannot add it to local server.' %
    277                                       cros_name)
    278       host_info = self.local_afe.get_hosts(hostname=m)
    279       if host_info:
    280         raise DuplicateAdd('Machine %s is already on the local server.' % m)
    281       try:
    282         self.AddLocalMachine(m)
    283         self.logger.LogOutput('Successfully added %s to local server.' % m)
    284       except Exception as e:
    285         traceback.print_exc()
    286         raise UpdateServerError(
    287             'Error occurred while attempting to add %s. %s' % (m, str(e)))
    288 
    289   def RemoveMachinesFromLocalServer(self):
    290     """Removes one or more machines from the local AFE server.
    291 
    292     Verify that the requested machines are legal to remove from the local
    293     server, i.e. that they are not ChromeOS HW lab machines.  Call
    294     RemoveLocalMachine for each valid machine.
    295 
    296     Raises:
    297       UpdateServerError:  Something went wrong while attempting to remove a
    298         machine.
    299     """
    300     for m in self.machines:
    301       for cros_name in [m, m + '.cros']:
    302         if cros_name in self.toolchain_lab_machines:
    303           raise UpdateNonLocalMachine(
    304               'Machine %s is in the ChromeOS HW Lab. '
    305               'This script cannot remove lab machines.' % cros_name)
    306       try:
    307         self.RemoveLocalMachine(m)
    308         self.logger.LogOutput('Successfully removed %s from local server.' % m)
    309       except Exception as e:
    310         traceback.print_exc()
    311         raise UpdateServerError('Error occurred while attempting to remove %s '
    312                                 '(%s).' % (m, str(e)))
    313 
    314   def ListMachineStates(self, machine_states):
    315     """Gets and prints the current status for a list of machines.
    316 
    317     Prints out the current status for all of the machines in the current
    318     AFELockManager's list of machines (set when the object is initialized).
    319 
    320     Args:
    321       machine_states: A dictionary of the current state of every machine in
    322         the current AFELockManager's list of machines.  Normally obtained by
    323         calling AFELockManager::GetMachineStates.
    324     """
    325     local_machines = []
    326     printed_hdr = False
    327     for m in machine_states:
    328       cros_name = m + '.cros'
    329       if (m in self.toolchain_lab_machines or
    330           cros_name in self.toolchain_lab_machines):
    331         name = m if m in self.toolchain_lab_machines else cros_name
    332         if not printed_hdr:
    333           self.PrintStatusHeader(True)
    334           printed_hdr = True
    335         state = machine_states[m]
    336         if state['locked']:
    337           print('%s (%s)\tlocked by %s since %s' %
    338                 (name, state['board'], state['locked_by'], state['lock_time']))
    339         else:
    340           print('%s (%s)\tunlocked' % (name, state['board']))
    341       else:
    342         local_machines.append(m)
    343 
    344     if local_machines:
    345       self.PrintStatusHeader(False)
    346       for m in local_machines:
    347         state = machine_states[m]
    348         if state['locked']:
    349           print('%s (%s)\tlocked by %s since %s' %
    350                 (m, state['board'], state['locked_by'], state['lock_time']))
    351         else:
    352           print('%s (%s)\tunlocked' % (m, state['board']))
    353 
    354   def UpdateLockInAFE(self, should_lock_machine, machine):
    355     """Calls an AFE server to lock/unlock a machine.
    356 
    357     Args:
    358       should_lock_machine: Boolean indicating whether to lock the machine (True)
    359         or unlock the machine (False).
    360       machine: The machine to update.
    361 
    362     Raises:
    363       LockingError:  An error occurred while attempting to update the machine
    364         state.
    365     """
    366     action = 'lock'
    367     if not should_lock_machine:
    368       action = 'unlock'
    369     kwargs = {'locked': should_lock_machine}
    370     kwargs['lock_reason'] = 'toolchain user request (%s)' % self.user
    371 
    372     cros_name = machine + '.cros'
    373     if cros_name in self.toolchain_lab_machines:
    374       machine = cros_name
    375     if machine in self.toolchain_lab_machines:
    376       m = machine.split('.')[0]
    377       afe_server = self.afe
    378     else:
    379       m = machine
    380       afe_server = self.local_afe
    381 
    382     try:
    383       afe_server.run('modify_hosts',
    384                      host_filter_data={'hostname__in': [m]},
    385                      update_data=kwargs)
    386     except Exception as e:
    387       traceback.print_exc()
    388       raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e)))
    389 
    390   def UpdateMachines(self, lock_machines):
    391     """Sets the locked state of the machines to the requested value.
    392 
    393     The machines updated are the ones in self.machines (specified when the
    394     class object was intialized).
    395 
    396     Args:
    397       lock_machines: Boolean indicating whether to lock the machines (True) or
    398         unlock the machines (False).
    399 
    400     Returns:
    401       A list of the machines whose state was successfully updated.
    402     """
    403     updated_machines = []
    404     for m in self.machines:
    405       self.UpdateLockInAFE(lock_machines, m)
    406       # Since we returned from self.UpdateLockInAFE we assume the request
    407       # succeeded.
    408       if lock_machines:
    409         self.logger.LogOutput('Locked machine(s) %s.' % m)
    410       else:
    411         self.logger.LogOutput('Unlocked machine(s) %s.' % m)
    412       updated_machines.append(m)
    413 
    414     return updated_machines
    415 
    416   def _InternalRemoveMachine(self, machine):
    417     """Remove machine from internal list of machines.
    418 
    419     Args:
    420       machine: Name of machine to be removed from internal list.
    421     """
    422     # Check to see if machine is lab machine and if so, make sure it has
    423     # ".cros" on the end.
    424     cros_machine = machine
    425     if machine.find('rack') > 0 and machine.find('row') > 0:
    426       if machine.find('.cros') == -1:
    427         cros_machine = cros_machine + '.cros'
    428 
    429     self.machines = [m for m in self.machines
    430                      if m != cros_machine and m != machine]
    431 
    432   def CheckMachineLocks(self, machine_states, cmd):
    433     """Check that every machine in requested list is in the proper state.
    434 
    435     If the cmd is 'unlock' verify that every machine is locked by requestor.
    436     If the cmd is 'lock' verify that every machine is currently unlocked.
    437 
    438     Args:
    439       machine_states: A dictionary of the current state of every machine in
    440         the current AFELockManager's list of machines.  Normally obtained by
    441         calling AFELockManager::GetMachineStates.
    442       cmd: The user-requested action for the machines: 'lock' or 'unlock'.
    443 
    444     Raises:
    445       DontOwnLock: The lock on a requested machine is owned by someone else.
    446     """
    447     for k, state in machine_states.iteritems():
    448       if cmd == 'unlock':
    449         if not state['locked']:
    450           self.logger.LogWarning('Attempt to unlock already unlocked machine '
    451                                  '(%s).' % k)
    452           self._InternalRemoveMachine(k)
    453 
    454         if state['locked'] and state['locked_by'] != self.user:
    455           raise DontOwnLock('Attempt to unlock machine (%s) locked by someone '
    456                             'else (%s).' % (k, state['locked_by']))
    457       elif cmd == 'lock':
    458         if state['locked']:
    459           self.logger.LogWarning('Attempt to lock already locked machine (%s)' %
    460                                  k)
    461           self._InternalRemoveMachine(k)
    462 
    463   def HasAFEServer(self, local):
    464     """Verifies that the AFELockManager has appropriate AFE server.
    465 
    466     Args:
    467       local: Boolean indicating whether we are checking for the local server
    468         (True) or for the global server (False).
    469 
    470     Returns:
    471       A boolean indicating if the AFELockManager has the requested AFE server.
    472     """
    473     if local:
    474       return self.local_afe is not None
    475     else:
    476       return self.afe is not None
    477 
    478   def GetMachineStates(self, cmd=''):
    479     """Gets the current state of all the requested machines.
    480 
    481     Gets the current state of all the requested machines, both from the HW lab
    482     sever and from the local server.  Stores the data in a dictionary keyed
    483     by machine name.
    484 
    485     Args:
    486       cmd: The command for which we are getting the machine states. This is
    487         important because if one of the requested machines is missing we raise
    488         an exception, unless the requested command is 'add'.
    489 
    490     Returns:
    491       A dictionary of machine states for all the machines in the AFELockManager
    492       object.
    493 
    494     Raises:
    495       NoAFEServer:  Cannot find the HW Lab or local AFE server.
    496       AFEAccessError:  An error occurred when querying the server about a
    497         machine.
    498     """
    499     if not self.HasAFEServer(False):
    500       raise NoAFEServer('Error: Cannot connect to main AFE server.')
    501 
    502     if self.local and not self.HasAFEServer(True):
    503       raise NoAFEServer('Error: Cannot connect to local AFE server.')
    504 
    505     machine_list = {}
    506     for m in self.machines:
    507       host_info = None
    508       cros_name = m + '.cros'
    509       if (m in self.toolchain_lab_machines or
    510           cros_name in self.toolchain_lab_machines):
    511         mod_host = m.split('.')[0]
    512         host_info = self.afe.get_hosts(hostname=mod_host)
    513         if not host_info:
    514           raise AFEAccessError('Unable to get information about %s from main'
    515                                ' autotest server.' % m)
    516       else:
    517         host_info = self.local_afe.get_hosts(hostname=m)
    518         if not host_info and cmd != 'add':
    519           raise AFEAccessError('Unable to get information about %s from '
    520                                'local autotest server.' % m)
    521       if host_info:
    522         host_info = host_info[0]
    523         name = host_info.hostname
    524         values = {}
    525         values['board'] = host_info.platform if host_info.platform else '??'
    526         values['locked'] = host_info.locked
    527         if host_info.locked:
    528           values['locked_by'] = host_info.locked_by
    529           values['lock_time'] = host_info.lock_time
    530         else:
    531           values['locked_by'] = ''
    532           values['lock_time'] = ''
    533         machine_list[name] = values
    534       else:
    535         machine_list[m] = {}
    536     return machine_list
    537 
    538 
    539 def Main(argv):
    540   """Parse the options, initialize lock manager and dispatch proper method.
    541 
    542   Args:
    543     argv: The options with which this script was invoked.
    544 
    545   Returns:
    546     0 unless an exception is raised.
    547   """
    548   parser = argparse.ArgumentParser()
    549 
    550   parser.add_argument(
    551       '--list',
    552       dest='cmd',
    553       action='store_const',
    554       const='status',
    555       help='List current status of all known machines.')
    556   parser.add_argument(
    557       '--lock',
    558       dest='cmd',
    559       action='store_const',
    560       const='lock',
    561       help='Lock given machine(s).')
    562   parser.add_argument(
    563       '--unlock',
    564       dest='cmd',
    565       action='store_const',
    566       const='unlock',
    567       help='Unlock given machine(s).')
    568   parser.add_argument(
    569       '--status',
    570       dest='cmd',
    571       action='store_const',
    572       const='status',
    573       help='List current status of given machine(s).')
    574   parser.add_argument(
    575       '--add_machine',
    576       dest='cmd',
    577       action='store_const',
    578       const='add',
    579       help='Add machine to local machine server.')
    580   parser.add_argument(
    581       '--remove_machine',
    582       dest='cmd',
    583       action='store_const',
    584       const='remove',
    585       help='Remove machine from the local machine server.')
    586   parser.add_argument(
    587       '--nolocal',
    588       dest='local',
    589       action='store_false',
    590       default=True,
    591       help='Do not try to use local machine server.')
    592   parser.add_argument(
    593       '--remote', dest='remote', help='machines on which to operate')
    594   parser.add_argument(
    595       '--chromeos_root',
    596       dest='chromeos_root',
    597       required=True,
    598       help='ChromeOS root to use for autotest scripts.')
    599   parser.add_argument(
    600       '--local_server',
    601       dest='local_server',
    602       default=None,
    603       help='Alternate local autotest server to use.')
    604   parser.add_argument(
    605       '--force',
    606       dest='force',
    607       action='store_true',
    608       default=False,
    609       help='Force lock/unlock of machines, even if not'
    610       ' current lock owner.')
    611 
    612   options = parser.parse_args(argv)
    613 
    614   if not options.remote and options.cmd != 'status':
    615     parser.error('No machines specified for operation.')
    616 
    617   if not os.path.isdir(options.chromeos_root):
    618     parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root)
    619 
    620   if not options.cmd:
    621     parser.error('No operation selected (--list, --status, --lock, --unlock,'
    622                  ' --add_machine, --remove_machine).')
    623 
    624   machine_list = []
    625   if options.remote:
    626     machine_list = options.remote.split()
    627 
    628   lock_manager = AFELockManager(machine_list, options.force,
    629                                 options.chromeos_root, options.local_server,
    630                                 options.local)
    631 
    632   machine_states = lock_manager.GetMachineStates(cmd=options.cmd)
    633   cmd = options.cmd
    634 
    635   if cmd == 'status':
    636     lock_manager.ListMachineStates(machine_states)
    637 
    638   elif cmd == 'lock':
    639     if not lock_manager.force:
    640       lock_manager.CheckMachineLocks(machine_states, cmd)
    641       lock_manager.UpdateMachines(True)
    642 
    643   elif cmd == 'unlock':
    644     if not lock_manager.force:
    645       lock_manager.CheckMachineLocks(machine_states, cmd)
    646       lock_manager.UpdateMachines(False)
    647 
    648   elif cmd == 'add':
    649     lock_manager.AddMachinesToLocalServer()
    650 
    651   elif cmd == 'remove':
    652     lock_manager.RemoveMachinesFromLocalServer()
    653 
    654   return 0
    655 
    656 
# Run as a script: pass the command-line options (without the program name)
# to Main and use its return value as the process exit status.
if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))
    659