Home | History | Annotate | Download | only in platform_KernelErrorPaths
      1 # Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import logging, os, time
      6 
      7 from autotest_lib.client.common_lib import error
      8 from autotest_lib.client.cros import constants
      9 from autotest_lib.client.cros.crash.crash_test import CrashTest as CrashTestDefs
     10 from autotest_lib.server import test
     11 
     12 class platform_KernelErrorPaths(test.test):
     13     """Performs various kernel crash tests and makes sure that the expected
     14        results are found in the crash report."""
     15     version = 1
     16 
     17     def _run_client_command(self, command):
     18         try:
     19             # Simply sending the trigger into lkdtm resets the target
     20             # immediately, leaving files unsaved to disk and the master ssh
     21             # connection wedged for a long time.
     22             self.client.run(
     23                 'sh -c "sync; sleep 1; %s" >/dev/null 2>&1 &' % command)
     24         except error.AutoservRunError, e:
     25             # It is expected that this will cause a non-zero exit status.
     26             pass
     27 
     28     def _provoke_crash(self, interface, trigger, cpu):
     29         """
     30         This test is ensuring that the machine will reboot on any
     31         type of kernel panic.  If the sysctls below are not set
     32         correctly, the machine will not reboot.  After verifying
     33         that the machine has the proper sysctl state, we make it
     34         reboot by writing to lkdtm.
     35 
     36         @param interface: which filesystem interface to write into
     37         @param trigger: the text string to write for triggering a crash
     38         @param cpu: None or a specific cpu number to pin before crashing
     39         """
     40         self.client.run('sysctl kernel.panic|grep "kernel.panic = -1"');
     41         self.client.run('sysctl kernel.panic_on_oops|'
     42                         'grep "kernel.panic_on_oops = 1"');
     43 
     44         if cpu != None:
     45             # Run on a specific CPU using taskset
     46             command = "echo %s | taskset -c %d tee %s" % (trigger, cpu,
     47                                                           interface)
     48         else:
     49             # Run normally
     50             command = "echo %s > %s" % (trigger, interface)
     51 
     52         logging.info("KernelErrorPaths: executing '%s' on %s",
     53                      command, self.client.hostname)
     54         self._run_client_command(command)
     55 
     56     def _exists_on_client(self, f):
     57         return self.client.run('ls "%s"' % f,
     58                                ignore_status=True).exit_status == 0
     59 
     60     def _enable_consent(self):
     61         """ Enable consent so that crashes get stored in /var/spool/crash. """
     62         self._consent_files = [
     63             (CrashTestDefs._PAUSE_FILE, None, 'chronos'),
     64             (CrashTestDefs._CONSENT_FILE, None, 'chronos'),
     65             (constants.SIGNED_POLICY_FILE, 'mock_metrics_on.policy', 'root'),
     66             (constants.OWNER_KEY_FILE, 'mock_metrics_owner.key', 'root'),
     67             ]
     68         for dst, src, owner in self._consent_files:
     69             if self._exists_on_client(dst):
     70                 self.client.run('mv "%s" "%s.autotest_backup"' % (dst, dst))
     71             if src:
     72                 full_src = os.path.join(self.autodir, 'client/cros', src)
     73                 self.client.send_file(full_src, dst)
     74             else:
     75                 self.client.run('touch "%s"' % dst)
     76             self.client.run('chown "%s" "%s"' % (owner, dst))
     77 
     78     def _restore_consent_files(self):
     79         """ Restore consent files to their previous values. """
     80         for f, _, _ in self._consent_files:
     81             self.client.run('rm -f "%s"' % f)
     82             if self._exists_on_client('%s.autotest_backup' % f):
     83                 self.client.run('mv "%s.autotest_backup" "%s"' % (f, f))
     84 
     85     def _wait_for_restart_and_check(self, boot_id, trigger, text, cpu=0,
     86                                     timeout=10):
     87         """
     88         Wait for panic reboot to complete and check @text in kcrash file.
     89 
     90         @param bootid: Boot ID of the current boot.
     91         @param trigger: Text string that specifies what caused the panic/reboot.
     92         @param text: Text string to match in the kcrash file.
     93         @param cpu: CPU on which the trigger happened.
     94         @param timeout: Time to wait for the remote host to go down.
     95 
     96         @raises error.TestFail if the @text string is not found in kcrash file.
     97         """
     98         try:
     99             self.client.wait_for_restart(
    100                 down_timeout=timeout,
    101                 down_warning=timeout,
    102                 old_boot_id=boot_id,
    103                 # Extend the default reboot timeout as some targets take
    104                 # longer than normal before ssh is available again.
    105                 timeout=self.client.DEFAULT_REBOOT_TIMEOUT * 4)
    106         except error.AutoservShutdownError:
    107             self.client.run('ps alx')
    108             raise
    109 
    110         # give the crash_reporter some time to log the crash
    111         time.sleep(5)
    112 
    113         # check if dir /var/spool/crash exists on client or not
    114         if not self._exists_on_client(self._crash_log_dir):
    115             raise error.TestFail(
    116                 '%s does not exists on client' % self._crash_log_dir)
    117 
    118         # check if kernel.*.kcrash files are on the client or not
    119         kcrash_file_path = '%s/kernel.*.kcrash' % self._crash_log_dir
    120         if not self.client.list_files_glob(kcrash_file_path):
    121             raise error.TestFail('No kcrash files found on client')
    122 
    123         result = self.client.run('cat %s/kernel.*.kcrash' %
    124                                  self._crash_log_dir)
    125         if not type(text) == tuple:
    126            match = (text,)
    127         else:
    128            match = text
    129         if not any(s in result.stdout for s in match):
    130             raise error.TestFail(
    131                 "'%s' not found in log after sending '%s' on cpu %d" %
    132                 ((match,), trigger, cpu))
    133 
    def _client_run_output(self, cmd):
        """
        Run a command on the client and hand back its output.

        @param cmd: command line to run on the client.

        @returns stdout of the command with surrounding whitespace stripped.
        """
        result = self.client.run(cmd)
        return result.stdout.strip()
    136 
    def _get_pid(self, comm, parent):
        """
        Fetch PID of process named comm.

        This function tries to lookup the PID for process named @comm. If
        @parent is not None, the parent process is first looked up and then the
        PID of child process matching @comm is returned. Since this method is
        typically called when processes are getting killed/re-spawned, lets
        try looking up the PID up to 10 times if there were errors.

        @param comm: Name of the process whose PID needs to be fetched.
        @param parent: Name of @comm's parent process. This parameter can be
                       None.

        @returns PID of matching process.

        @raises error.TestFail exception if PID for @comm is not found.
        """
        for _ in range(10):
            try:
                if not parent:
                    return self._client_run_output('ps -C %s -o pid=' % comm)

                ppid = self._client_run_output('ps -C %s -o pid=' % parent)
                children = self._client_run_output(
                    'ps --ppid %s -o pid= -o comm=' % ppid).splitlines()
                for line in children:
                    fields = line.split()
                    # Ignore lines without both a pid and a comm column
                    # instead of failing on them.
                    if len(fields) >= 2 and fields[1] == comm:
                        return fields[0]
                # Also reached when the parent currently has no children at
                # all, e.g. while the child is being re-spawned.
                logging.info("comm mismatch: no child of %s is named %s",
                             parent, comm)
            except error.AutoservRunError as e:
                logging.debug("AutoservRunError is: %s", e)
            # Give the process a moment to (re)appear before retrying.
            time.sleep(1)
        raise error.TestFail("Unable to get pid. comm = %s, parent = %s"
                             % (comm, parent))
    179 
    def _trigger_sysrq_x(self):
        """Send one sysrq-x by writing 'x' to the client's sysrq trigger."""
        sysrq_command = 'echo x > /proc/sysrq-trigger'
        self._run_client_command(sysrq_command)
    182 
    def _test_sysrq_x(self):
        """
        Test sysrq-x.

        To help debug system hangs, we ask users to invoke alt-volume_up-x
        key combination. The kernel sysrq-x handler is what handles the
        alt-volume_up-x key combination. The sysrq-x handler in the kernel
        does the following for successive sysrq-x invocations within a 20
        second interval:
        1. Abort the chrome process whose parent is the session_manager process.
        2. Abort the X process. On Freon enabled systems, X is no longer present
           so this step is a no-op.
        3. Panic the kernel.
        This function tests the above steps.

        @raises error.TestFail if a process does not get a new PID after
                sysrq-x, or if the expected text is missing from the kcrash
                file after the final panic.
        """
        for process, parent in [('chrome', 'session_manager'),
                                ('X', None)]:
            # Compare by value: identity comparison against a string literal
            # only works by accident of interning.
            if process == 'X':
                # With Freon there is no longer an X process. Lets send the
                # sysrq_x and then continue on.
                self._trigger_sysrq_x()
                continue
            orig_pid = self._get_pid(process, parent)
            self._trigger_sysrq_x()
            # Poll until the process shows up with a different PID.
            for _ in range(10):
                new_pid = self._get_pid(process, parent)
                logging.info("%s's original pid was %s and new pid is %s",
                             process, orig_pid, new_pid)
                if new_pid != orig_pid:
                    break
                time.sleep(1)
            else:
                raise error.TestFail('%s did not restart on sysrq-x' % process)

        # The third sysrq-x is the one expected to panic the kernel.
        boot_id = self.client.get_boot_id()
        trigger = 'sysrq-x'
        text = 'sysrq_handle_cros_xkey'
        self._trigger_sysrq_x()
        self._wait_for_restart_and_check(boot_id, trigger, text)
    222 
    def _test_panic_path(self, lkdtm, kcrash_tuple):
        """
        Test the kernel panic paths.

        @param lkdtm: trigger string to write to the lkdtm provoke-crash
                      interface; also used to identify the crash type.
        @param kcrash_tuple: tuple of (breakme, timeout, all_cpu, text):
                      the trigger string for /proc/breakme (or None if that
                      interface has no equivalent), the time to wait for the
                      client to go down, whether to repeat the crash on every
                      cpu, and the text (or tuple of alternative texts) to
                      look for in the kcrash file.
        """
        breakme, timeout, all_cpu, text = kcrash_tuple

        # Figure out which kernel crash interface is available.
        interface = "/sys/kernel/debug/provoke-crash/DIRECT"
        trigger = lkdtm
        if not self._exists_on_client(interface):
            interface = "/proc/breakme"
            trigger = breakme
            logging.info("Falling back to %s", interface)

        # Skip any triggers that are undefined for the given interface.
        if trigger is None:
            logging.info("Skipping unavailable trigger %s", lkdtm)
            return
        if lkdtm == "HARDLOCKUP":
            # ARM systems do not (presently) have NMI, so skip them for now.
            arch = self.client.get_arch()
            if arch.startswith('arm'):
                logging.info("Skipping %s on architecture %s.",
                             trigger, arch)
                return
            # Make sure a soft lockup detection doesn't get in the way.
            self.client.run("sysctl -w kernel.softlockup_panic=0")

        if all_cpu:
            # Find out how many cpus we have. Always run on at least one
            # cpu, even if no "processor" line was counted.
            cpu_count = self.client.run(
                'cat /proc/cpuinfo | grep processor | wc -l').stdout.strip()
            no_cpus = max(1, int(cpu_count))
        else:
            no_cpus = 1

        if trigger == "SPINLOCKUP":
            # This needs to be pre-triggered so the second one locks.
            self._provoke_crash(interface, trigger, None)

        for cpu in range(no_cpus):
            # Delete crash results, if any
            self.client.run('rm -f %s/*' % self._crash_log_dir)
            boot_id = self.client.get_boot_id()
            # This should cause target reset.
            # Run on a specific cpu if we're running on all of them,
            # otherwise run normally
            if all_cpu:
                self._provoke_crash(interface, trigger, cpu)
            else:
                self._provoke_crash(interface, trigger, None)
            self._wait_for_restart_and_check(boot_id, trigger, text,
                                             cpu=cpu, timeout=timeout)
    279 
    def run_once(self, kcrashes, host=None):
        """
        Run the requested kernel crash tests against the client.

        @param kcrashes: comma-delimited string of kcrash types to test.
                         Each entry is a key of the kcrash_types table below,
                         'SYSRQ_X' for the sysrq-x test, or 'ALL' for the
                         sysrq-x test plus every entry of the table.
                         Whitespace around entries is ignored.
        @param host: host object for the client under test.

        @raises error.TestFail if any requested kcrash type is unknown (raised
                after the known ones have been run), or if one of the
                individual crash checks fails.
        """
        self.client = host
        self._enable_consent()
        self._crash_log_dir = CrashTestDefs._SYSTEM_CRASH_DIR

        # kcrash data is given by a dictionary with key lkdtm string to write
        # to /sys/kernel/debug/provoke-crash/DIRECT on the target. The dict
        # value is a tuple containing 1) the string to write to /proc/breakme.
        # if lkdtm is not available, 2) the timeout, and 3)whether we run
        # the tests on all CPUs or not. Some tests take less to run than other
        # (null pointer and panic) so it would be best if we would run them on
        # all the CPUS as it wouldn't add that much time to the total.
        # The final component is the crash report string to look for in the
        # crash dump after target restarts.
        kcrash_types = {
            'BUG' : ('bug', 10, False, 'kernel BUG at'),
            'HUNG_TASK' : ('hungtask', 300, False, 'hung_task: blocked tasks'),
            'SOFTLOCKUP' : (None, 25, False, 'BUG: soft lockup'),
            'HARDLOCKUP' : ('nmiwatchdog', 50, False,
                            'Watchdog detected hard LOCKUP'),
            'SPINLOCKUP' : (None, 25, False, ('softlockup: hung tasks',
                                              'BUG: scheduling while atomic')),
            'EXCEPTION' : ('nullptr',     10, True,
             # x86 gives "BUG: unable to" while ARM gives "Unable to".
                           'nable to handle kernel NULL pointer '
                           'dereference at'),
            'PANIC' : ('panic', 10, True, 'Kernel panic - not syncing:'),
            'CORRUPT_STACK' : (None, 10, True,
                               'stack-protector: Kernel stack is '
                               'corrupted in:')
            }

        # Expected input is comma-delimited kcrashes string
        kcrash_list = [name.strip() for name in kcrashes.split(',')]
        if 'SYSRQ_X' in kcrash_list or 'ALL' in kcrash_list:
            self._test_sysrq_x()
            if 'ALL' in kcrash_list:
                kcrash_list = list(kcrash_types)
            else:
                # SYSRQ_X has just been handled and is not a table entry.
                kcrash_list = [name for name in kcrash_list
                               if name != 'SYSRQ_X']

        bad_kcrashes = []
        for kcrash in kcrash_list:
            if kcrash not in kcrash_types:
                # Remember it and keep going so the valid requests still run.
                bad_kcrashes.append(kcrash)
                continue
            self._test_panic_path(kcrash, kcrash_types[kcrash])

        if bad_kcrashes:
            raise error.TestFail("Wrong kcrash type "
                                 "requested (%s)" % str(bad_kcrashes))
    331 
    def cleanup(self):
        """Restore the client's consent files, then run the base cleanup."""
        try:
            self._restore_consent_files()
        finally:
            # Run the base class cleanup even if restoring the consent
            # files failed.
            test.test.cleanup(self)
    335