Home | History | Annotate | Download | only in chaos_lib
      1 # Copyright 2016 The Chromium OS Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import contextlib
      6 import datetime
      7 import logging
      8 import pprint
      9 import time
     10 
     11 import common
     12 from autotest_lib.client.common_lib import error, site_utils
     13 from autotest_lib.client.common_lib import utils as base_utils
     14 from autotest_lib.client.common_lib.cros.network import ap_constants
     15 from autotest_lib.client.common_lib.cros.network import iw_runner
     16 from autotest_lib.server import hosts
     17 from autotest_lib.server import site_linux_system
     18 from autotest_lib.server.cros import host_lock_manager
     19 from autotest_lib.server.cros.ap_configurators import ap_batch_locker
     20 from autotest_lib.server.cros.ap_configurators \
     21         import ap_configurator_factory
     22 from autotest_lib.server.cros.network import chaos_clique_utils as utils
     23 from autotest_lib.server.cros.network import wifi_client
     24 from autotest_lib.server.hosts import adb_host
     25 
     26 # Webdriver master hostname; ChaosRunner.run() opens an SSH host to it as user 'chaosvmmaster' to check and power the webdriver VM on and off.
     27 MASTERNAME = 'chromeos3-chaosvmmaster.cros.corp.google.com'
     28 WEBDRIVER_PORT = 9515  # Used as both ends of the webdriver SSH tunnel and in the 'lsof -i' check.
     29 
     30 
     31 class ChaosRunner(object):
     32     """Object to run a network_WiFi_ChaosXXX test."""
     33 
     34 
     35     def __init__(self, test, host, spec, broken_pdus=list()):
     36         """Initializes and runs test.
     37 
     38         @param test: a string, test name.
     39         @param host: an Autotest host object, device under test.
     40         @param spec: an APSpec object.
     41         @param broken_pdus: list of offline PDUs.
     42 
     43         """
     44         self._test = test
     45         self._host = host
     46         self._ap_spec = spec
     47         self._broken_pdus = broken_pdus
     48         # Log server and DUT times
     49         dt = datetime.datetime.now()
     50         logging.info('Server time: %s', dt.strftime('%a %b %d %H:%M:%S %Y'))
     51         logging.info('DUT time: %s', self._host.run('date').stdout.strip())
     52 
     53 
     54     def run(self, job, batch_size=10, tries=10, capturer_hostname=None,
     55             conn_worker=None, work_client_hostname=None,
     56             disabled_sysinfo=False):
     57         """Executes Chaos test.
     58 
     59         @param job: an Autotest job object.
     60         @param batch_size: an integer, max number of APs to lock in one batch.
     61         @param tries: an integer, number of iterations to run per AP.
     62         @param capturer_hostname: a string or None, hostname or IP of capturer.
     63         @param conn_worker: ConnectionWorkerAbstract or None, to run extra
     64                             work after successful connection.
     65         @param work_client_hostname: a string or None, hostname of work client
     66         @param disabled_sysinfo: a bool, disable collection of logs from DUT.
     67 
     68 
     69         @raises TestError: Issues locking VM webdriver instance
     70         """
     71 
     72         lock_manager = host_lock_manager.HostLockManager()
     73         webdriver_master = hosts.SSHHost(MASTERNAME, user='chaosvmmaster')
     74         host_prefix = self._host.hostname.split('-')[0]
     75         with host_lock_manager.HostsLockedBy(lock_manager):
     76             capture_host = utils.allocate_packet_capturer(
     77                     lock_manager, hostname=capturer_hostname,
     78                     prefix=host_prefix)
     79             # Cleanup and reboot packet capturer before the test.
     80             utils.sanitize_client(capture_host)
     81             capturer = site_linux_system.LinuxSystem(capture_host, {},
     82                                                      'packet_capturer')
     83 
     84             # Run iw scan and abort if more than allowed number of APs are up.
     85             iw_command = iw_runner.IwRunner(capture_host)
     86             start_time = time.time()
     87             logging.info('Performing a scan with a max timeout of 30 seconds.')
     88             capture_interface = 'wlan0'
     89             capturer_info = capture_host.run('cat /etc/lsb-release',
     90                                              ignore_status=True, timeout=5).stdout
     91             if 'whirlwind' in capturer_info:
     92                 # Use the dual band aux radio for scanning networks.
     93                 capture_interface = 'wlan2'
     94             while time.time() - start_time <= ap_constants.MAX_SCAN_TIMEOUT:
     95                 networks = iw_command.scan(capture_interface)
     96                 if networks is None:
     97                     if (time.time() - start_time ==
     98                             ap_constants.MAX_SCAN_TIMEOUT):
     99                         raise error.TestError(
    100                             'Packet capturer is not responding to scans. Check'
    101                             'device and re-run test')
    102                     continue
    103                 elif len(networks) < ap_constants.MAX_SSID_COUNT:
    104                     break
    105                 elif len(networks) >= ap_constants.MAX_SSID_COUNT:
    106                     raise error.TestError(
    107                         'Probably someone is already running a '
    108                         'chaos test?!')
    109 
    110             if conn_worker is not None:
    111                 work_client_machine = utils.allocate_packet_capturer(
    112                         lock_manager, hostname=work_client_hostname)
    113                 conn_worker.prepare_work_client(work_client_machine)
    114 
    115             # Lock VM. If on, power off; always power on. Then create a tunnel.
    116             webdriver_instance = utils.allocate_webdriver_instance(lock_manager)
    117 
    118             if utils.is_VM_running(webdriver_master, webdriver_instance):
    119                 logging.info('VM %s was on; powering off for a clean instance',
    120                              webdriver_instance)
    121                 utils.power_off_VM(webdriver_master, webdriver_instance)
    122                 logging.info('Allow VM time to gracefully shut down')
    123                 time.sleep(5)
    124 
    125             logging.info('Starting up VM %s', webdriver_instance)
    126             utils.power_on_VM(webdriver_master, webdriver_instance)
    127             logging.info('Allow VM time to power on before creating a tunnel.')
    128             time.sleep(5)
    129 
    130             if not site_utils.host_is_in_lab_zone(webdriver_instance.hostname):
    131                 self._ap_spec._webdriver_hostname = webdriver_instance.hostname
    132             else:
    133                 # If in the lab then port forwarding must be done so webdriver
    134                 # connection will be over localhost.
    135                 self._ap_spec._webdriver_hostname = 'localhost'
    136                 webdriver_tunnel = webdriver_instance.create_ssh_tunnel(
    137                                                 WEBDRIVER_PORT, WEBDRIVER_PORT)
    138                 logging.info('Wait for tunnel to be created.')
    139                 for i in range(3):
    140                     time.sleep(10)
    141                     results = base_utils.run('lsof -i:%s' % WEBDRIVER_PORT,
    142                                              ignore_status=True)
    143                     if results:
    144                         break
    145                 if not results:
    146                     raise error.TestError(
    147                             'Unable to listen to WEBDRIVER_PORT: %s', results)
    148 
    149             batch_locker = ap_batch_locker.ApBatchLocker(
    150                     lock_manager, self._ap_spec,
    151                     ap_test_type=ap_constants.AP_TEST_TYPE_CHAOS)
    152 
    153             while batch_locker.has_more_aps():
    154                 # Work around for CrOS devices only:crbug.com/358716
    155                 # Do not reboot Android devices:b/27977927
    156                 if self._host.get_os_type() != adb_host.OS_TYPE_ANDROID:
    157                     utils.sanitize_client(self._host)
    158                 healthy_dut = True
    159 
    160                 with contextlib.closing(wifi_client.WiFiClient(
    161                     hosts.create_host({'hostname' : self._host.hostname,
    162                             'afe_host' : self._host._afe_host},
    163                             host_class=self._host.__class__),
    164                     './debug', False)) as client:
    165 
    166                     aps = batch_locker.get_ap_batch(batch_size=batch_size)
    167                     if not aps:
    168                         logging.info('No more APs to test.')
    169                         break
    170 
    171                     # Power down all of the APs because some can get grumpy
    172                     # if they are configured several times and remain on.
    173                     # User the cartridge to down group power downs and
    174                     # configurations.
    175                     utils.power_down_aps(aps, self._broken_pdus)
    176                     utils.configure_aps(aps, self._ap_spec, self._broken_pdus)
    177 
    178                     aps = utils.filter_quarantined_and_config_failed_aps(aps,
    179                             batch_locker, job, self._broken_pdus)
    180 
    181                     for ap in aps:
    182                         # http://crbug.com/306687
    183                         if ap.ssid == None:
    184                             logging.error('The SSID was not set for the AP:%s',
    185                                           ap)
    186 
    187                         healthy_dut = utils.is_dut_healthy(client, ap)
    188 
    189                         if not healthy_dut:
    190                             logging.error('DUT is not healthy, rebooting.')
    191                             batch_locker.unlock_and_reclaim_aps()
    192                             break
    193 
    194                         networks = utils.return_available_networks(
    195                                 ap, capturer, job, self._ap_spec)
    196 
    197                         if networks is None:
    198                             # If scan returned no networks, iw scan failed.
    199                             # Reboot the packet capturer device and
    200                             # reconfigure the capturer.
    201                             batch_locker.unlock_and_reclaim_ap(ap.host_name)
    202                             logging.error('Packet capture is not healthy, '
    203                                           'rebooting.')
    204                             capturer.host.reboot()
    205                             capturer = site_linux_system.LinuxSystem(
    206                                            capture_host, {},'packet_capturer')
    207                             continue
    208                         if networks == list():
    209                            # Packet capturer did not find the SSID in scan or
    210                            # there was a security mismatch.
    211                            utils.release_ap(ap, batch_locker, self._broken_pdus)
    212                            continue
    213 
    214                         assoc_params = ap.get_association_parameters()
    215 
    216                         if not utils.is_conn_worker_healthy(
    217                                 conn_worker, ap, assoc_params, job):
    218                             utils.release_ap(
    219                                     ap, batch_locker, self._broken_pdus)
    220                             continue
    221 
    222                         name = ap.name
    223                         kernel_ver = self._host.get_kernel_ver()
    224                         firmware_ver = utils.get_firmware_ver(self._host)
    225                         if not firmware_ver:
    226                             firmware_ver = "Unknown"
    227 
    228                         debug_dict = {'+++PARSE DATA+++': '+++PARSE DATA+++',
    229                                       'SSID': ap._ssid,
    230                                       'DUT': client.wifi_mac,
    231                                       'AP Info': ap.name,
    232                                       'kernel_version': kernel_ver,
    233                                       'wifi_firmware_version': firmware_ver}
    234                         debug_string = pprint.pformat(debug_dict)
    235 
    236                         logging.info('Waiting %d seconds for the AP dhcp '
    237                                      'server', ap.dhcp_delay)
    238                         time.sleep(ap.dhcp_delay)
    239 
    240                         result = job.run_test(self._test,
    241                                      capturer=capturer,
    242                                      capturer_frequency=networks[0].frequency,
    243                                      capturer_ht_type=networks[0].ht,
    244                                      host=self._host,
    245                                      assoc_params=assoc_params,
    246                                      client=client,
    247                                      tries=tries,
    248                                      debug_info=debug_string,
    249                                      # Copy all logs from the system
    250                                      disabled_sysinfo=disabled_sysinfo,
    251                                      conn_worker=conn_worker,
    252                                      tag=ap.ssid if conn_worker is None else
    253                                          '%s.%s' % (conn_worker.name, ap.ssid))
    254 
    255                         utils.release_ap(ap, batch_locker, self._broken_pdus)
    256 
    257                         if conn_worker is not None:
    258                             conn_worker.cleanup()
    259 
    260                     if not healthy_dut:
    261                         continue
    262 
    263                 batch_locker.unlock_aps()
    264 
    265             if webdriver_tunnel:
    266                 webdriver_instance.disconnect_ssh_tunnel(webdriver_tunnel,
    267                                                          WEBDRIVER_PORT)
    268                 webdriver_instance.close()
    269             capturer.close()
    270             logging.info('Powering off VM %s', webdriver_instance)
    271             utils.power_off_VM(webdriver_master, webdriver_instance)
    272             lock_manager.unlock(webdriver_instance.hostname)
    273 
    274             if self._broken_pdus:
    275                 logging.info('PDU is down!!!\nThe following PDUs are down:\n')
    276                 pprint.pprint(self._broken_pdus)
    277 
    278             factory = ap_configurator_factory.APConfiguratorFactory(
    279                     ap_constants.AP_TEST_TYPE_CHAOS)
    280             factory.turn_off_all_routers(self._broken_pdus)
    281