# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging, os, time

from autotest_lib.client.common_lib import error
from autotest_lib.client.cros import constants
from autotest_lib.client.cros.crash.crash_test import CrashTest as CrashTestDefs
from autotest_lib.server import test


class platform_KernelErrorPaths(test.test):
    """Performs various kernel crash tests and makes sure that the expected
    results are found in the crash report."""
    version = 1

    def _run_client_command(self, command):
        """
        Run @command on the client in a detached background shell.

        The command is preceded by "sync; sleep 1" and backgrounded.

        @param command: shell command line to execute on the client.
        """
        try:
            # Simply sending the trigger into lkdtm resets the target
            # immediately, leaving files unsaved to disk and the master ssh
            # connection wedged for a long time.
            self.client.run(
                    'sh -c "sync; sleep 1; %s" >/dev/null 2>&1 &' % command)
        except error.AutoservRunError as e:
            # It is expected that this will cause a non-zero exit status.
            logging.debug("Ignoring expected error from '%s': %s", command, e)

    def _provoke_crash(self, interface, trigger, cpu):
        """
        This test is ensuring that the machine will reboot on any
        type of kernel panic.  If the sysctls below are not set
        correctly, the machine will not reboot.  After verifying
        that the machine has the proper sysctl state, we make it
        reboot by writing to lkdtm.

        @param interface: which filesystem interface to write into
        @param trigger: the text string to write for triggering a crash
        @param cpu: None or a specific cpu number to pin before crashing
        """
        self.client.run('sysctl kernel.panic|grep "kernel.panic = -1"')
        self.client.run('sysctl kernel.panic_on_oops|'
                        'grep "kernel.panic_on_oops = 1"')

        if cpu is not None:
            # Run on a specific CPU using taskset
            command = "echo %s | taskset -c %d tee %s" % (trigger, cpu,
                                                          interface)
        else:
            # Run normally
            command = "echo %s > %s" % (trigger, interface)

        logging.info("KernelErrorPaths: executing '%s' on %s",
                     command, self.client.hostname)
        self._run_client_command(command)

    def _exists_on_client(self, f):
        """
        Check whether path @f exists on the client.

        @param f: path to look up with 'ls' on the client.

        @returns True if 'ls' exits with status 0, False otherwise.
        """
        return self.client.run('ls "%s"' % f,
                               ignore_status=True).exit_status == 0

    def _enable_consent(self):
        """
        Enable consent so that crashes get stored in /var/spool/crash.

        Every pre-existing destination file is first moved aside to
        "<file>.autotest_backup" so _restore_consent_files() can put it back.
        """
        # Each entry is (destination on client, source file or None, owner).
        # A source of None means an empty file is created with 'touch'.
        self._consent_files = [
            (CrashTestDefs._PAUSE_FILE, None, 'chronos'),
            (CrashTestDefs._CONSENT_FILE, None, 'chronos'),
            (constants.SIGNED_POLICY_FILE, 'mock_metrics_on.policy', 'root'),
            (constants.OWNER_KEY_FILE, 'mock_metrics_owner.key', 'root'),
        ]
        for dst, src, owner in self._consent_files:
            if self._exists_on_client(dst):
                self.client.run('mv "%s" "%s.autotest_backup"' % (dst, dst))
            if src:
                full_src = os.path.join(self.autodir, 'client/cros', src)
                self.client.send_file(full_src, dst)
            else:
                self.client.run('touch "%s"' % dst)
            self.client.run('chown "%s" "%s"' % (owner, dst))

    def _restore_consent_files(self):
        """
        Restore consent files to their previous values.

        Does nothing if _enable_consent() never ran, so that cleanup() does
        not fail with an AttributeError when run_once() was not reached.
        """
        for f, _, _ in getattr(self, '_consent_files', []):
            self.client.run('rm -f "%s"' % f)
            if self._exists_on_client('%s.autotest_backup' % f):
                self.client.run('mv "%s.autotest_backup" "%s"' % (f, f))

    def _wait_for_restart_and_check(self, boot_id, trigger, text, cpu=0,
                                    timeout=10):
        """
        Wait for panic reboot to complete and check @text in kcrash file.

        @param boot_id: Boot ID of the current boot.
        @param trigger: Text string that specifies what caused the panic/reboot.
        @param text: Text string to match in the kcrash file, or a tuple of
                     strings of which at least one has to match.
        @param cpu: CPU on which the trigger happened.
        @param timeout: Time to wait for the remote host to go down.

        @raises error.TestFail if the @text string is not found in kcrash file.
        """
        try:
            self.client.wait_for_restart(
                    down_timeout=timeout,
                    down_warning=timeout,
                    old_boot_id=boot_id,
                    # Extend the default reboot timeout as some targets take
                    # longer than normal before ssh is available again.
                    timeout=self.client.DEFAULT_REBOOT_TIMEOUT * 4)
        except error.AutoservShutdownError:
            # Run 'ps alx' on the client before re-raising; presumably its
            # output is meant to help debug why the host did not restart.
            self.client.run('ps alx')
            raise

        # give the crash_reporter some time to log the crash
        time.sleep(5)

        # check if dir /var/spool/crash exists on client or not
        if not self._exists_on_client(self._crash_log_dir):
            raise error.TestFail(
                    '%s does not exists on client' % self._crash_log_dir)

        # check if kernel.*.kcrash files are on the client or not
        kcrash_file_path = '%s/kernel.*.kcrash' % self._crash_log_dir
        if not self.client.list_files_glob(kcrash_file_path):
            raise error.TestFail('No kcrash files found on client')

        result = self.client.run('cat %s' % kcrash_file_path)
        if isinstance(text, tuple):
            match = text
        else:
            match = (text,)
        if not any(s in result.stdout for s in match):
            # List every accepted string instead of printing a nested tuple.
            raise error.TestFail(
                    "'%s' not found in log after sending '%s' on cpu %d" %
                    ("' or '".join(match), trigger, cpu))

    def _client_run_output(self, cmd):
        """
        Run @cmd on the client and return its stripped stdout.

        @param cmd: shell command line to execute on the client.

        @returns stdout of the command with surrounding whitespace removed.
        """
        return self.client.run(cmd).stdout.strip()

    def _get_pid(self, comm, parent):
        """
        Fetch PID of process named comm.

        This function tries to lookup the PID for process named @comm. If
        @parent is not None, the parent process is first looked up and then the
        PID of child process matching @comm is returned. Since this method is
        typically called when processes are getting killed/re-spawned, lets
        try looking up the PID up to 10 times if there were errors.

        @param comm: Name of the process whose PID needs to be fetched.
        @param parent: Name of @comm's parent process. This parameter can be
                       None.

        @returns PID of matching process.

        @raises error.TestFail exception if PID for @comm is not found.
        """
        for _ in range(10):
            try:
                if parent:
                    ppid = self._client_run_output('ps -C %s -o pid=' % parent)
                    children = self._client_run_output(
                            'ps --ppid %s -o pid= -o comm=' % ppid).splitlines()
                    pid = None
                    # Stays None when the parent has no children (yet), which
                    # must lead to a retry rather than an unbound variable.
                    new_comm = None
                    for line in children:
                        pair = line.split()
                        if len(pair) < 2:
                            # Not a "<pid> <comm>" line; ignore it.
                            continue
                        new_comm = pair[1]
                        if comm == new_comm:
                            pid = pair[0]
                            break
                    if pid is None:
                        logging.info("comm mismatch: %s != %s", comm, new_comm)
                        time.sleep(1)
                        continue
                else:
                    pid = self._client_run_output('ps -C %s -o pid=' % comm)
                return pid
            except error.AutoservRunError as e:
                logging.debug("AutotestRunError is: %s", e)
                time.sleep(1)
        raise error.TestFail("Unable to get pid. comm = %s, parent = %s"
                             % (comm, parent))

    def _trigger_sysrq_x(self):
        """Send a single sysrq-x to the client through /proc/sysrq-trigger."""
        self._run_client_command('echo x > /proc/sysrq-trigger')

    def _test_sysrq_x(self):
        """
        Test sysrq-x.

        To help debug system hangs, we ask users to invoke alt-volume_up-x
        key combination. The kernel sysrq-x handler is what handles the
        alt-volume_up-x key combination. The sysrq-x handler in the kernel
        does the following for successive sysrq-x invocations within a 20
        second interval:
        1. Abort the chrome process whose parent is the session_manager process.
        2. Abort the X process. On Freon enabled systems, X is no longer present
           so this step is a no-op.
        3. Panic the kernel.
        This function tests the above steps.
        """
        for process, parent in [('chrome', 'session_manager'),
                                ('X', None)]:
            # Compare by value; identity comparison against a string literal
            # only works by accident of interning.
            if process == 'X':
                # With Freon there is no longer an X process. Lets send the
                # sysrq_x and then continue on.
                self._trigger_sysrq_x()
                continue
            orig_pid = self._get_pid(process, parent)
            self._trigger_sysrq_x()
            for _ in range(10):
                new_pid = self._get_pid(process, parent)
                logging.info("%s's original pid was %s and new pid is %s",
                             process, orig_pid, new_pid)
                if new_pid != orig_pid:
                    break
                time.sleep(1)
            else:
                raise error.TestFail('%s did not restart on sysrq-x' % process)

        boot_id = self.client.get_boot_id()
        trigger = 'sysrq-x'
        text = 'sysrq_handle_cros_xkey'
        self._trigger_sysrq_x()
        self._wait_for_restart_and_check(boot_id, trigger, text)

    def _test_panic_path(self, lkdtm, kcrash_tuple):
        """
        Test the kernel panic paths.

        @param lkdtm: trigger string to write to the lkdtm DIRECT interface.
        @param kcrash_tuple: tuple of (trigger string for /proc/breakme or
                None, timeout to wait for the host to go down, whether to
                repeat the crash on every cpu, text(s) expected in the kcrash
                file).
        """

        # Figure out which kernel crash interface is available.
        interface = "/sys/kernel/debug/provoke-crash/DIRECT"
        trigger = lkdtm
        breakme, timeout, all_cpu, text = kcrash_tuple
        if not self._exists_on_client(interface):
            interface = "/proc/breakme"
            trigger = breakme
            logging.info("Falling back to %s", interface)

        # Skip any triggers that are undefined for the given interface.
        if trigger is None:
            logging.info("Skipping unavailable trigger %s", lkdtm)
            return
        if lkdtm == "HARDLOCKUP":
            # ARM systems do not (presently) have NMI, so skip them for now.
            arch = self.client.get_arch()
            if arch.startswith('arm'):
                logging.info("Skipping %s on architecture %s.",
                             trigger, arch)
                return
            # Make sure a soft lockup detection doesn't get in the way.
            self.client.run("sysctl -w kernel.softlockup_panic=0")

        # Always run on at least one cpu; only ask the client how many cpus
        # it has when the crash is to be repeated on each of them.
        if all_cpu:
            no_cpus = int(
                    self.client.run(
                            'cat /proc/cpuinfo | grep processor | wc -l')
                    .stdout.strip())
        else:
            no_cpus = 1

        if trigger == "SPINLOCKUP":
            # This needs to be pre-triggered so the second one locks.
            self._provoke_crash(interface, trigger, None)

        for cpu in range(no_cpus):
            # Delete crash results, if any
            self.client.run('rm -f %s/*' % self._crash_log_dir)
            boot_id = self.client.get_boot_id()
            # This should cause target reset.
            # Run on a specific cpu if we're running on all of them,
            # otherwise run normally
            if all_cpu:
                self._provoke_crash(interface, trigger, cpu)
            else:
                self._provoke_crash(interface, trigger, None)
            self._wait_for_restart_and_check(boot_id, trigger, text,
                                             cpu=cpu, timeout=timeout)

    def run_once(self, kcrashes, host=None):
        """
        Run the requested kernel crash tests against @host.

        @param kcrashes: comma-delimited string of crash types to test. Valid
                names are the keys of kcrash_types below plus 'SYSRQ_X' and
                'ALL'.
        @param host: host object of the client under test.

        @raises error.TestFail if an unknown crash type was requested; the
                known ones are still run first.
        """
        self.client = host
        self._enable_consent()
        self._crash_log_dir = CrashTestDefs._SYSTEM_CRASH_DIR

        # kcrash data is given by a dictionary with key lkdtm string to write
        # to /sys/kernel/debug/provoke-crash/DIRECT on the target. The dict
        # value is a tuple containing 1) the string to write to /proc/breakme
        # if lkdtm is not available, 2) the timeout, and 3) whether we run
        # the tests on all CPUs or not. Some tests take less to run than other
        # (null pointer and panic) so it would be best if we would run them on
        # all the CPUS as it wouldn't add that much time to the total.
        # The final component is the crash report string to look for in the
        # crash dump after target restarts.
        kcrash_types = {
            'BUG' : ('bug', 10, False, 'kernel BUG at'),
            'HUNG_TASK' : ('hungtask', 300, False, 'hung_task: blocked tasks'),
            'SOFTLOCKUP' : (None, 25, False, 'BUG: soft lockup'),
            'HARDLOCKUP' : ('nmiwatchdog', 50, False,
                            'Watchdog detected hard LOCKUP'),
            'SPINLOCKUP' : (None, 25, False, ('softlockup: hung tasks',
                                              'BUG: scheduling while atomic')),
            'EXCEPTION' : ('nullptr', 10, True,
                     # x86 gives "BUG: unable to" while ARM gives "Unable to".
                           'nable to handle kernel NULL pointer '
                           'dereference at'),
            'PANIC' : ('panic', 10, True, 'Kernel panic - not syncing:'),
            'CORRUPT_STACK' : (None, 10, True,
                               'stack-protector: Kernel stack is '
                               'corrupted in:')
            }

        bad_kcrashes = []

        # Expected input is comma-delimited kcrashes string; tolerate
        # whitespace around the individual names.
        kcrash_list = [name.strip() for name in kcrashes.split(',')]
        if 'SYSRQ_X' in kcrash_list or 'ALL' in kcrash_list:
            self._test_sysrq_x()
            if 'SYSRQ_X' in kcrash_list:
                kcrash_list.remove('SYSRQ_X')
        if 'ALL' in kcrash_list:
            kcrash_list = list(kcrash_types)
        for kcrash in kcrash_list:
            if kcrash not in kcrash_types:
                bad_kcrashes.append(kcrash)
                continue
            self._test_panic_path(kcrash, kcrash_types[kcrash])

        if bad_kcrashes:
            raise error.TestFail("Wrong kcrash type "
                                 "requested (%s)" % str(bad_kcrashes))

    def cleanup(self):
        """Restore the client's consent files, then run the base cleanup."""
        self._restore_consent_files()
        test.test.cleanup(self)