Home | History | Annotate | Download | only in cros
      1 # Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import logging, os, re, shutil, time
      6 
      7 from autotest_lib.client.bin import utils
      8 from autotest_lib.client.common_lib import error
      9 from autotest_lib.client.cros import cros_logging, sys_power
     10 #pylint: disable=W0611
     11 from autotest_lib.client.cros import flimflam_test_path
     12 import flimflam
     13 
     14 class Suspender(object):
     15     """Class for suspend/resume measurements.
     16 
     17     Public attributes:
     18         disconnect_3G_time: Amount of seconds it took to disable 3G.
     19         successes[]: List of timing measurement dicts from successful suspends.
     20         failures[]: List of SuspendFailure exceptions from failed suspends.
     21         device_times[]: List of individual device suspend/resume time dicts.
     22 
     23     Public methods:
     24         suspend: Do a suspend/resume cycle. Return timing measurement dict.
     25 
     26     Private attributes:
     27         _logs: Array of /var/log/messages lines since start of suspend cycle.
     28         _log_file: Open file descriptor at the end of /var/log/messages.
     29         _logdir: Directory to store firmware logs in case of errors.
     30         _suspend: Set to the sys_power suspend function to use.
     31         _throw: Set to have SuspendFailure exceptions raised to the caller.
     32         _reset_pm_print_times: Set to deactivate pm_print_times after the test.
     33         _restart_tlsdated: Set to restart tlsdated after the test.
     34 
     35     Private methods:
     36         __init__: Shuts off tlsdated for duration of test, disables 3G
     37         __del__: Restore tlsdated (must run eventually, but GC delay no problem)
     38         _set_pm_print_times: Enable/disable kernel device suspend timing output.
     39         _check_failure_log: Check /sys/.../suspend_stats for new failures.
     40         _ts: Returns a timestamp from /var/run/power_manager/last_resume_timings
     41         _hwclock_ts: Read RTC timestamp left on resume in hwclock-on-resume
     42         _device_resume_time: Read seconds overall device resume took from logs.
     43         _individual_device_times: Reads individual device suspend/resume times.
     44         _identify_driver: Return the driver name of a device (or "unknown").
     45     """
     46 
     47     # board-specific "time to suspend" values determined empirically
     48     # TODO: migrate to separate file with http://crosbug.com/38148
     49     _DEFAULT_SUSPEND_DELAY = 5
     50     _SUSPEND_DELAY = {
     51         # TODO: Reevaluate this when http://crosbug.com/38460 is fixed
     52         'daisy': 6,
     53         'daisy_spring': 6,
     54         'peach_pit': 6,
     55 
     56         # TODO: Reevaluate these when http://crosbug.com/38225 is fixed
     57         'x86-mario': 6,
     58         'x86-alex': 5,
     59 
     60         # Lumpy and Stumpy need high values, because it seems to mitigate their
     61         # RTC interrupt problem. See http://crosbug.com/36004
     62         'lumpy': 5,
     63         'stumpy': 5,
     64 
     65         # RTS5209 card reader has a really bad staging driver, can take ~1 sec
     66         'butterfly': 4,
     67 
     68         # Hard disk sync and overall just slow
     69         'parrot': 8,
     70         'kiev': 9,
     71     }
     72 
     73     # alarm/not_before value guaranteed to raise SpuriousWakeup in _hwclock_ts
     74     _ALARM_FORCE_EARLY_WAKEUP = 2147483647
     75 
     76     # File written by send_metrics_on_resume containing timing information about
     77     # the last resume.
     78     _TIMINGS_FILE = '/var/run/power_manager/root/last_resume_timings'
     79 
     80     # Amount of lines to dump from the eventlog on a SpuriousWakeup. Should be
     81     # enough to include ACPI Wake Reason... 10 should be far on the safe side.
     82     _RELEVANT_EVENTLOG_LINES = 10
     83 
     84     # Sanity check value to catch overlong resume times (from missed RTC wakes)
     85     _MAX_RESUME_TIME = 10
     86 
     87     # File written by powerd_suspend containing the hwclock time at resume.
     88     HWCLOCK_FILE = '/var/run/power_manager/root/hwclock-on-resume'
     89 
     90     def __init__(self, logdir, method=sys_power.do_suspend,
     91                  throw=False, device_times=False):
     92         """Prepare environment for suspending."""
     93         self.disconnect_3G_time = 0
     94         self.successes = []
     95         self.failures = []
     96         self._logdir = logdir
     97         self._suspend = method
     98         self._throw = throw
     99         self._reset_pm_print_times = False
    100         self._restart_tlsdated = False
    101         self._log_file = None
    102         if device_times:
    103             self.device_times = []
    104 
    105         # stop tlsdated, make sure we/hwclock have /dev/rtc for ourselves
    106         if utils.system_output('initctl status tlsdated').find('start') != -1:
    107             utils.system('initctl stop tlsdated')
    108             self._restart_tlsdated = True
    109             # give process's file descriptors time to asynchronously tear down
    110             time.sleep(0.1)
    111 
    112         # prime powerd_suspend RTC timestamp saving and make sure hwclock works
    113         utils.open_write_close(self.HWCLOCK_FILE, '')
    114         hwclock_output = utils.system_output('hwclock -r --debug --utc',
    115                                              ignore_status=True)
    116         if not re.search('Using.*/dev interface to.*clock', hwclock_output):
    117             raise error.TestError('hwclock cannot find rtc: ' + hwclock_output)
    118 
    119         # activate device suspend timing debug output
    120         if hasattr(self, 'device_times'):
    121             if not int(utils.read_one_line('/sys/power/pm_print_times')):
    122                 self._set_pm_print_times(True)
    123                 self._reset_pm_print_times = True
    124 
    125         # Shut down 3G to remove its variability from suspend time measurements
    126         flim = flimflam.FlimFlam()
    127         service = flim.FindCellularService(0)
    128         if service:
    129             logging.info('Found 3G interface, disconnecting.')
    130             start_time = time.time()
    131             (success, status) = flim.DisconnectService(
    132                     service=service, wait_timeout=60)
    133             if success:
    134                 logging.info('3G disconnected successfully.')
    135                 self.disconnect_3G_time = time.time() - start_time
    136             else:
    137                 logging.error('Could not disconnect: %s.', status)
    138                 self.disconnect_3G_time = -1
    139 
    140 
    141     def _set_pm_print_times(self, on):
    142         """Enable/disable extra suspend timing output from powerd to syslog."""
    143         if utils.system('echo %s > /sys/power/pm_print_times' % int(bool(on)),
    144                 ignore_status=True):
    145             logging.warning('Failed to set pm_print_times to %s', bool(on))
    146             del self.device_times
    147             self._reset_pm_print_times = False
    148         else:
    149             logging.info('Device resume times set to %s', bool(on))
    150 
    151 
    152     def _get_board(self):
    153         """Remove _freon from get_board if found."""
    154 	return (utils.get_board().replace("_freon", ""))
    155 
    156 
    157     def _reset_logs(self):
    158         """Throw away cached log lines and reset log pointer to current end."""
    159         if self._log_file:
    160             self._log_file.close()
    161         self._log_file = open('/var/log/messages')
    162         self._log_file.seek(0, os.SEEK_END)
    163         self._logs = []
    164 
    165 
    166     def _update_logs(self, retries=11):
    167         """
    168         Read all lines logged since last reset into log cache. Block until last
    169         powerd_suspend resume message was read, raise if it takes too long.
    170         """
    171         finished_regex = re.compile(r'powerd_suspend\[\d+\]: Resume finished')
    172         for retry in xrange(retries + 1):
    173             lines = self._log_file.readlines()
    174             if lines:
    175                 if self._logs and self._logs[-1][-1] != '\n':
    176                     # Reassemble line that was cut in the middle
    177                     self._logs[-1] += lines.pop(0)
    178                 self._logs += lines
    179             for line in reversed(self._logs):
    180                 if (finished_regex.search(line)):
    181                     return
    182             time.sleep(0.005 * 2**retry)
    183 
    184         raise error.TestError("Sanity check failed: did not try to suspend.")
    185 
    186 
    187     def _ts(self, name, retries=11):
    188         """Searches logs for last timestamp with a given suspend message."""
    189         # Occasionally need to retry due to races from process wakeup order
    190         for retry in xrange(retries + 1):
    191             try:
    192                 f = open(self._TIMINGS_FILE)
    193                 for line in f:
    194                     words = line.split('=')
    195                     if name == words[0]:
    196                         try:
    197                             timestamp = float(words[1])
    198                         except ValueError:
    199                             logging.warning('Invalid timestamp: %s', line)
    200                             timestamp = 0
    201                         return timestamp
    202             except IOError:
    203                 pass
    204             time.sleep(0.005 * 2**retry)
    205 
    206         raise error.TestError('Could not find %s entry.' % name)
    207 
    208 
    209     def _hwclock_ts(self, not_before, retries=3):
    210         """Read the RTC resume timestamp saved by powerd_suspend."""
    211         for retry in xrange(retries + 1):
    212             early_wakeup = False
    213             if os.path.exists(self.HWCLOCK_FILE):
    214                 match = re.search(r'([0-9]+) seconds since .+ (-?[0-9.]+) sec',
    215                                   utils.read_file(self.HWCLOCK_FILE), re.DOTALL)
    216                 if match:
    217                     seconds = int(match.group(1)) + float(match.group(2))
    218                     logging.debug('RTC resume timestamp read: %f', seconds)
    219                     if seconds >= not_before:
    220                         return seconds
    221                     early_wakeup = True
    222             time.sleep(0.05 * retry)
    223         if early_wakeup:
    224             logging.debug('Early wakeup, dumping eventlog if it exists:\n')
    225             elog = utils.system_output('mosys eventlog list | tail -n %d' %
    226                     self._RELEVANT_EVENTLOG_LINES, ignore_status=True)
    227             wake_elog = (['unknown'] + re.findall(r'Wake Source.*', elog))[-1]
    228             for line in reversed(self._logs):
    229                 match = re.search(r'PM1_STS: WAK.*', line)
    230                 if match:
    231                     wake_syslog = match.group(0)
    232                     break
    233             else:
    234                 wake_syslog = 'unknown'
    235             for b, e, s in sys_power.SpuriousWakeupError.S3_WHITELIST:
    236                 if (re.search(b, utils.get_board()) and
    237                         re.search(e, wake_elog) and re.search(s, wake_syslog)):
    238                     logging.warning('Whitelisted spurious wake in S3: %s | %s',
    239                                     wake_elog, wake_syslog)
    240                     return None
    241             raise sys_power.SpuriousWakeupError('Spurious wake in S3: %s | %s'
    242                     % (wake_elog, wake_syslog))
    243         if self._get_board() in ['lumpy', 'stumpy', 'kiev']:
    244             logging.debug('RTC read failure (crosbug/36004), dumping nvram:\n' +
    245                     utils.system_output('mosys nvram dump', ignore_status=True))
    246             return None
    247         raise error.TestError('Broken RTC timestamp: ' +
    248                               utils.read_file(self.HWCLOCK_FILE))
    249 
    250 
    251     def _firmware_resume_time(self):
    252         """Calculate seconds for firmware resume from logged TSC. (x86 only)"""
    253         if utils.get_arch() not in ['i686', 'x86_64']:
    254             # TODO: support this on ARM somehow
    255             return 0
    256         regex = re.compile(r'TSC at resume: (\d+)$')
    257         freq = 1000 * int(utils.read_one_line(
    258                 '/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq'))
    259         for line in reversed(self._logs):
    260             match = regex.search(line)
    261             if match:
    262                 return float(match.group(1)) / freq
    263 
    264         raise error.TestError('Failed to find TSC resume value in syslog.')
    265 
    266 
    267     def _device_resume_time(self):
    268         """Read amount of seconds for overall device resume from syslog."""
    269         regex = re.compile(r'PM: resume of devices complete after ([0-9.]+)')
    270         for line in reversed(self._logs):
    271             match = regex.search(line)
    272             if match:
    273                 return float(match.group(1)) / 1000
    274 
    275         raise error.TestError('Failed to find device resume time in syslog.')
    276 
    277 
    278     def _individual_device_times(self, start_resume):
    279         """Return dict of individual device suspend and resume times."""
    280         self.device_times.append(dict())
    281         regex = re.compile(r'call ([^ ]+)\+ returned 0 after ([0-9]+) usecs')
    282         for line in self._logs:
    283             match = regex.search(line)
    284             if match:
    285                 key = 'seconds_dev_' + match.group(1).replace(':', '-')
    286                 secs = float(match.group(2)) / 1e6
    287                 if cros_logging.extract_kernel_timestamp(line) > start_resume:
    288                     key += '_resume'
    289                 else:
    290                     key += '_suspend'
    291                 if key in self.device_times[-1]:
    292                     logging.warning('Duplicate entry for %s: +%f', key, secs)
    293                     self.device_times[-1][key] += secs
    294                 else:
    295                     logging.debug('%s: %f', key, secs)
    296                     self.device_times[-1][key] = secs
    297 
    298 
    299     def _identify_driver(self, device):
    300         """Return the driver name of a device (or "unknown")."""
    301         for path, subdirs, _ in os.walk('/sys/devices'):
    302             if device in subdirs:
    303                 node = os.path.join(path, device, 'driver')
    304                 if not os.path.exists(node):
    305                     return "unknown"
    306                 return os.path.basename(os.path.realpath(node))
    307         else:
    308             return "unknown"
    309 
    310 
    311     def _check_for_errors(self):
    312         """Find and identify suspend errors. Return True iff we should retry."""
    313         warning_regex = re.compile(r' kernel: \[.*WARNING:')
    314         abort_regex = re.compile(r' kernel: \[.*Freezing of tasks abort'
    315                 r'| powerd_suspend\[.*Cancel suspend at kernel'
    316                 r'| kernel: \[.*PM: Wakeup pending, aborting suspend')
    317         # rsyslogd can put this out of order with dmesg, so track in variable
    318         fail_regex = re.compile(r'powerd_suspend\[\d+\]: Error')
    319         failed = False
    320 
    321         # TODO(scottz): warning_monitor crosbug.com/38092
    322         for i in xrange(len(self._logs)):
    323             line = self._logs[i]
    324             if warning_regex.search(line):
    325                 # match the source file from the WARNING line, and the
    326                 # actual error text by peeking one or two lines below that
    327                 src = cros_logging.strip_timestamp(line)
    328                 text = cros_logging.strip_timestamp(self._logs[i + 1]) + '\n' \
    329                      + cros_logging.strip_timestamp(self._logs[i + 2])
    330                 for p1, p2 in sys_power.KernelError.WHITELIST:
    331                     if re.search(p1, src) and re.search(p2, text):
    332                         logging.info('Whitelisted KernelError: %s', src)
    333                         break
    334                 else:
    335                     raise sys_power.KernelError("%s\n%s" % (src, text))
    336             if abort_regex.search(line):
    337                 wake_source = 'unknown'
    338                 match = re.search(r'last active wakeup source: (.*)$',
    339                         '\n'.join(self._logs[i-5:i+3]), re.MULTILINE)
    340                 if match:
    341                     wake_source = match.group(1)
    342                 driver = self._identify_driver(wake_source)
    343                 for b, w in sys_power.SpuriousWakeupError.S0_WHITELIST:
    344                     if (re.search(b, utils.get_board()) and
    345                             re.search(w, wake_source)):
    346                         logging.warning('Whitelisted spurious wake before '
    347                                         'S3: %s | %s', wake_source, driver)
    348                         return True
    349                 if "rtc" in driver:
    350                     raise sys_power.SuspendTimeout('System took too '
    351                                                    'long to suspend.')
    352                 raise sys_power.SpuriousWakeupError('Spurious wake '
    353                         'before S3: %s | %s' % (wake_source, driver))
    354             if fail_regex.search(line):
    355                 failed = True
    356         if failed:
    357             raise sys_power.SuspendFailure('Unidentified problem.')
    358         return False
    359 
    360 
    361     def suspend(self, duration=10):
    362         """
    363         Do a single suspend for 'duration' seconds. Estimates the amount of time
    364         it takes to suspend for a board (see _SUSPEND_DELAY), so the actual RTC
    365         wakeup delay will be longer. Returns None on errors, or raises the
    366         exception when _throw is set. Returns a dict of general measurements,
    367         or a tuple (general_measurements, individual_device_times) when
    368         _device_times is set.
    369 
    370         @param duration: time in seconds to do a suspend prior to waking.
    371         """
    372         try:
    373             iteration = len(self.failures) + len(self.successes) + 1
    374 
    375             # Retry suspend in case we hit a known (whitelisted) bug
    376             for _ in xrange(10):
    377                 self._reset_logs()
    378                 utils.system('sync')
    379                 board_delay = self._SUSPEND_DELAY.get(self._get_board(),
    380                         self._DEFAULT_SUSPEND_DELAY)
    381                 try:
    382                     alarm = self._suspend(duration + board_delay)
    383                 except sys_power.SpuriousWakeupError:
    384                     # might be another error, we check for it ourselves below
    385                     alarm = self._ALARM_FORCE_EARLY_WAKEUP
    386 
    387                 if os.path.exists('/sys/firmware/log'):
    388                     for msg in re.findall(r'^.*ERROR.*$',
    389                             utils.read_file('/sys/firmware/log'), re.M):
    390                         for board, pattern in sys_power.FirmwareError.WHITELIST:
    391                             if (re.search(board, utils.get_board()) and
    392                                     re.search(pattern, msg)):
    393                                 logging.info('Whitelisted FW error: ' + msg)
    394                                 break
    395                         else:
    396                             firmware_log = os.path.join(self._logdir,
    397                                     'firmware.log.' + str(iteration))
    398                             shutil.copy('/sys/firmware/log', firmware_log)
    399                             logging.info('Saved firmware log: ' + firmware_log)
    400                             raise sys_power.FirmwareError(msg.strip('\r\n '))
    401 
    402                 self._update_logs()
    403                 if not self._check_for_errors():
    404                     hwclock_ts = self._hwclock_ts(alarm)
    405                     if hwclock_ts:
    406                         break
    407             else:
    408                 raise error.TestWarn('Ten tries failed due to whitelisted bug')
    409 
    410             # calculate general measurements
    411             start_resume = self._ts('start_resume_time')
    412             kernel_down = (self._ts('end_suspend_time') -
    413                            self._ts('start_suspend_time'))
    414             kernel_up = self._ts('end_resume_time') - start_resume
    415             devices_up = self._device_resume_time()
    416             total_up = hwclock_ts - alarm
    417             firmware_up = self._firmware_resume_time()
    418             board_up = total_up - kernel_up - firmware_up
    419             try:
    420                 cpu_up = self._ts('cpu_ready_time', 0) - start_resume
    421             except error.TestError:
    422                 # can be missing on non-SMP machines
    423                 cpu_up = None
    424             if total_up > self._MAX_RESUME_TIME:
    425                 raise error.TestError('Sanity check failed: missed RTC wakeup.')
    426 
    427             logging.info('Success(%d): %g down, %g up, %g board, %g firmware, '
    428                          '%g kernel, %g cpu, %g devices',
    429                          iteration, kernel_down, total_up, board_up,
    430                          firmware_up, kernel_up, cpu_up, devices_up)
    431             self.successes.append({
    432                 'seconds_system_suspend': kernel_down,
    433                 'seconds_system_resume': total_up,
    434                 'seconds_system_resume_firmware': firmware_up + board_up,
    435                 'seconds_system_resume_firmware_cpu': firmware_up,
    436                 'seconds_system_resume_firmware_ec': board_up,
    437                 'seconds_system_resume_kernel': kernel_up,
    438                 'seconds_system_resume_kernel_cpu': cpu_up,
    439                 'seconds_system_resume_kernel_dev': devices_up,
    440                 })
    441 
    442             if hasattr(self, 'device_times'):
    443                 self._individual_device_times(start_resume)
    444                 return (self.successes[-1], self.device_times[-1])
    445             else:
    446                 return self.successes[-1]
    447 
    448         except sys_power.SuspendFailure as ex:
    449             message = '%s(%d): %s' % (type(ex).__name__, iteration, ex)
    450             logging.error(message)
    451             self.failures.append(ex)
    452             if self._throw:
    453                 if type(ex).__name__ in ['KernelError', 'SuspendTimeout']:
    454                     raise error.TestWarn(message)
    455                 else:
    456                     raise error.TestFail(message)
    457             return None
    458 
    459 
    460     def finalize(self):
    461         """Restore normal environment (not turning 3G back on for now...)"""
    462         if os.path.exists(self.HWCLOCK_FILE):
    463             os.remove(self.HWCLOCK_FILE)
    464             if self._restart_tlsdated:
    465                 utils.system('initctl start tlsdated')
    466             if self._reset_pm_print_times:
    467                 self._set_pm_print_times(False)
    468 
    469 
    470     def __del__(self):
    471         self.finalize()
    472