# Plain text  |  580 lines  |  25.36 KB

# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import collections, logging, os, re, shutil, time

from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import base_utils, error
from autotest_lib.client.cros import cros_logging, sys_power
from autotest_lib.client.cros import power_utils
from autotest_lib.client.cros import power_status
#pylint: disable=W0611
from autotest_lib.client.cros import flimflam_test_path
import flimflam

class Suspender(object):
    """Class for suspend/resume measurements.

    Public attributes:
        disconnect_3G_time: Amount of seconds it took to disable 3G.
        successes[]: List of timing measurement dicts from successful suspends.
        failures[]: List of SuspendFailure exceptions from failed suspends.
        device_times[]: List of individual device suspend/resume time dicts.

    Public methods:
        suspend: Do a suspend/resume cycle. Return timing measurement dict.

    Private attributes:
        _logs: Array of /var/log/messages lines since start of suspend cycle.
        _log_file: Open file descriptor at the end of /var/log/messages.
        _logdir: Directory to store firmware logs in case of errors.
        _suspend: Set to the sys_power suspend function to use.
        _throw: Set to have SuspendFailure exceptions raised to the caller.
        _reset_pm_print_times: Set to deactivate pm_print_times after the test.
        _restart_tlsdated: Set to restart tlsdated after the test.

    Private methods:
        __init__: Shuts off tlsdated for duration of test, disables 3G
        __del__: Restore tlsdated (must run eventually, but GC delay no problem)
        _set_pm_print_times: Enable/disable kernel device suspend timing output.
        _check_for_errors: Scan the cached log lines for suspend errors.
        _ts: Returns a timestamp from
             /var/run/power_manager/root/last_resume_timings
        _hwclock_ts: Read RTC timestamp left on resume in hwclock-on-resume
        _device_resume_time: Read seconds overall device resume took from logs.
        _individual_device_times: Reads individual device suspend/resume times.
        _identify_driver: Return the driver name of a device (or "unknown").
    """

    # board-specific "time to suspend" values determined empirically
    # TODO: migrate to separate file with http://crosbug.com/38148
    _DEFAULT_SUSPEND_DELAY = 5
    _SUSPEND_DELAY = {
        # TODO: Reevaluate this when http://crosbug.com/38460 is fixed
        'daisy': 6,
        'daisy_spring': 6,
        'peach_pit': 6,

        # TODO: Reevaluate these when http://crosbug.com/38225 is fixed
        'x86-mario': 6,
        'x86-alex': 5,

        # Lumpy and Stumpy need high values, because it seems to mitigate their
        # RTC interrupt problem. See http://crosbug.com/36004
        'lumpy': 5,
        'stumpy': 5,

        # RTS5209 card reader has a really bad staging driver, can take ~1 sec
        'butterfly': 4,

        # Hard disk sync and overall just slow
        'parrot': 8,
        'kiev': 9,
    }

    # alarm/not_before value guaranteed to raise SpuriousWakeup in _hwclock_ts
    _ALARM_FORCE_EARLY_WAKEUP = 2147483647

    # File written by send_metrics_on_resume containing timing information about
    # the last resume.
    _TIMINGS_FILE = '/var/run/power_manager/root/last_resume_timings'

    # Amount of lines to dump from the eventlog on a SpuriousWakeup. Should be
    # enough to include ACPI Wake Reason... 10 should be far on the safe side.
    _RELEVANT_EVENTLOG_LINES = 10

    # Sanity check value to catch overlong resume times (from missed RTC wakes)
    _MAX_RESUME_TIME = 10

    # File written by powerd_suspend containing the hwclock time at resume.
    HWCLOCK_FILE = '/var/run/power_manager/root/hwclock-on-resume'

    # File read by powerd to decide on the state to suspend (mem or freeze).
    _SUSPEND_STATE_PREF_FILE = 'suspend_to_idle'

    def __init__(self, logdir, method=sys_power.do_suspend,
                 throw=False, device_times=False, suspend_state=''):
        """
        Prepare environment for suspending.

        Stops tlsdated if it is running, primes the hwclock-on-resume file,
        optionally enables pm_print_times, disconnects the cellular service
        if one is found and finally applies the requested suspend state.

        @param logdir: Directory to store firmware logs in case of errors.
        @param method: Function performing the suspend. It is called with
                       the number of seconds to suspend for and its return
                       value is used as the expected wakeup time.
        @param throw: If set, suspend() raises on a SuspendFailure instead
                      of returning None.
        @param device_times: If set, individual device suspend/resume times
                             are collected in self.device_times.
        @param suspend_state: Suspend state to enter into. It can be
                              'mem' or 'freeze' or an empty string. If
                              the suspend state is an empty string,
                              system suspends to the default pref.

        @raises error.TestError: if the hwclock debug output does not
                                 mention a /dev clock interface.
        @raises error.TestNAError: if suspend_state is not offered by
                                   /sys/power/state.
        """
        self.disconnect_3G_time = 0
        self.successes = []
        self.failures = []
        self._logdir = logdir
        self._suspend = method
        self._throw = throw
        self._reset_pm_print_times = False
        self._restart_tlsdated = False
        self._log_file = None
        self._suspend_state = suspend_state
        # The attribute only exists when device timing was requested; the
        # rest of the class tests for it with hasattr().
        if device_times:
            self.device_times = []

        # stop tlsdated, make sure we/hwclock have /dev/rtc for ourselves
        if utils.system_output('initctl status tlsdated').find('start') != -1:
            utils.system('initctl stop tlsdated')
            self._restart_tlsdated = True
            # give process's file descriptors time to asynchronously tear down
            time.sleep(0.1)

        # prime powerd_suspend RTC timestamp saving and make sure hwclock works
        utils.open_write_close(self.HWCLOCK_FILE, '')
        hwclock_output = utils.system_output('hwclock -r --debug --utc',
                                             ignore_status=True)
        if not re.search('Using.*/dev interface to.*clock', hwclock_output):
            raise error.TestError('hwclock cannot find rtc: ' + hwclock_output)

        # activate device suspend timing debug output
        if hasattr(self, 'device_times'):
            if not int(utils.read_one_line('/sys/power/pm_print_times')):
                # Set the flag BEFORE calling the helper: when the write
                # fails the helper clears the flag again (and removes
                # device_times), and that must not be overwritten here.
                self._reset_pm_print_times = True
                self._set_pm_print_times(True)

        # Shut down 3G to remove its variability from suspend time measurements
        flim = flimflam.FlimFlam()
        service = flim.FindCellularService(0)
        if service:
            logging.info('Found 3G interface, disconnecting.')
            start_time = time.time()
            (success, status) = flim.DisconnectService(
                    service=service, wait_timeout=60)
            if success:
                logging.info('3G disconnected successfully.')
                self.disconnect_3G_time = time.time() - start_time
            else:
                logging.error('Could not disconnect: %s.', status)
                # -1 marks a failed disconnect
                self.disconnect_3G_time = -1

        self._configure_suspend_state()

    def _configure_suspend_state(self):
        """Configure the suspend state as requested.

        Does nothing when no suspend state was requested, or when the
        requested state already is the current sleep state. Otherwise a
        PowerPrefChanger is created and kept in self._power_pref_changer.

        @raises error.TestNAError: if the requested state is not one of the
                                   states listed in /sys/power/state.
        """
        if not self._suspend_state:
            return
        # /sys/power/state lists the state names separated by whitespace.
        # Compare against whole names, so that a mere fragment of a name
        # (e.g. 'me') is not accepted as a valid state.
        available_suspend_states = utils.read_one_line(
                '/sys/power/state').split()
        if self._suspend_state not in available_suspend_states:
            raise error.TestNAError('Invalid suspend state: ' +
                                    self._suspend_state)
        # Check the current state. If it is same as the one requested,
        # we don't want to call PowerPrefChanger(restarts powerd).
        if self._suspend_state == power_utils.get_sleep_state():
            return
        should_freeze = '1' if self._suspend_state == 'freeze' else '0'
        new_prefs = {self._SUSPEND_STATE_PREF_FILE: should_freeze}
        self._power_pref_changer = power_utils.PowerPrefChanger(new_prefs)

    def _set_pm_print_times(self, on):
        """Write 1 or 0 to /sys/power/pm_print_times.

        When the write fails, a warning is logged, the device_times
        attribute is removed (if present) and _reset_pm_print_times is
        cleared, so that no later attempt is made to switch it back.

        @param on: Truthy to enable the timing output, falsy to disable it.
        """
        enabled = bool(on)
        if utils.system('echo %s > /sys/power/pm_print_times' % int(enabled),
                        ignore_status=True):
            logging.warning('Failed to set pm_print_times to %s', enabled)
            # The attribute may already be gone after an earlier failure;
            # an unconditional del would raise AttributeError then.
            if hasattr(self, 'device_times'):
                del self.device_times
            self._reset_pm_print_times = False
        else:
            logging.info('Device resume times set to %s', enabled)


    def _get_board(self):
        """Return the board name with any "_freon" part stripped out."""
        board = utils.get_board()
        return board.replace("_freon", "")


    def _reset_logs(self):
        """Forget all cached log lines and continue reading /var/log/messages
        from its current end."""
        stale_handle = self._log_file
        if stale_handle:
            stale_handle.close()
        # Re-open and skip everything that has been logged so far
        self._log_file = open('/var/log/messages')
        self._log_file.seek(0, os.SEEK_END)
        self._logs = []


    def _update_logs(self, retries=11):
        """
        Pull newly logged lines into the log cache until resume has finished.

        Polls the open log file, sleeping twice as long after every attempt,
        until one of the cached lines carries the powerd_suspend
        "Resume finished" message.

        @param retries: Number of additional polling attempts after the first.

        @raises error.TestError: if the message does not show up in time.
        """
        done_pattern = re.compile(r'powerd_suspend\[\d+\]: Resume finished')
        for attempt in xrange(retries + 1):
            fresh = self._log_file.readlines()
            if fresh:
                if self._logs and self._logs[-1][-1] != '\n':
                    # The previous read stopped mid-line; glue the rest on
                    self._logs[-1] += fresh.pop(0)
                self._logs.extend(fresh)
            if any(done_pattern.search(entry)
                   for entry in reversed(self._logs)):
                return
            time.sleep(0.005 * 2**attempt)

        raise error.TestError("Sanity check failed: did not try to suspend.")


    def _ts(self, name, retries=11):
        """Return the timestamp stored under 'name' in the timings file.

        Reads _TIMINGS_FILE line by line, splits each line at '=' and
        returns the value of the first line whose key equals name. The
        lookup is repeated with growing sleeps in between while the file
        cannot be opened or does not contain the key.

        @param name: Key to look for.
        @param retries: Number of additional attempts after the first one.

        @returns: The value as a float, or 0 if it could not be parsed.

        @raises error.TestError: if no attempt found the key.
        """
        # Occasionally need to retry due to races from process wakeup order
        for retry in xrange(retries + 1):
            try:
                # 'with' closes the file on every exit path, including the
                # returns from inside the loop.
                with open(self._TIMINGS_FILE) as f:
                    for line in f:
                        words = line.split('=')
                        if name != words[0]:
                            continue
                        try:
                            return float(words[1])
                        except (ValueError, IndexError):
                            # IndexError: the line holds the key but no '='
                            logging.warning('Invalid timestamp: %s', line)
                            return 0
            except IOError:
                # file not there (yet); try again after the sleep below
                pass
            time.sleep(0.005 * 2**retry)

        raise error.TestError('Could not find %s entry.' % name)


    def _hwclock_ts(self, not_before, retries=3):
        """Read the RTC resume timestamp saved by powerd_suspend.

        Parses HWCLOCK_FILE for a date followed by a seconds offset and
        converts it to a timestamp. A timestamp below not_before counts as
        an early wakeup.

        @param not_before: Smallest timestamp (in seconds) that is accepted.
        @param retries: Number of additional read attempts after the first.

        @returns: The timestamp in seconds, or None when the early wakeup
                  matched an S3_WHITELIST entry or when the timestamp could
                  not be read on lumpy, stumpy or kiev.

        @raises sys_power.SpuriousWakeupError: on an early wakeup that is
                not whitelisted.
        @raises error.TestError: if no timestamp could be read on any other
                board.
        """
        for retry in xrange(retries + 1):
            # Reset on every attempt, so only the last attempt decides
            early_wakeup = False
            if os.path.exists(self.HWCLOCK_FILE):
                match = re.search(r'(.+\w)\s+(-?[0-9.]+) seconds',
                                  utils.read_file(self.HWCLOCK_FILE), re.DOTALL)
                if match:
                    # group(1) is the date string, group(2) the offset in
                    # seconds that follows it
                    timeval = time.strptime(match.group(1),
                            "%a %b %d %H:%M:%S %Y")
                    seconds = time.mktime(timeval)
                    seconds += float(match.group(2))
                    logging.debug('RTC resume timestamp read: %f', seconds)
                    if seconds >= not_before:
                        return seconds
                    early_wakeup = True
            # no sleep after the first attempt (retry is 0), then growing
            time.sleep(0.05 * retry)
        if early_wakeup:
            logging.debug('Early wakeup, dumping eventlog if it exists:\n')
            elog = utils.system_output('mosys eventlog list | tail -n %d' %
                    self._RELEVANT_EVENTLOG_LINES, ignore_status=True)
            # last 'Wake Source' line of the eventlog, 'unknown' if none
            wake_elog = (['unknown'] + re.findall(r'Wake Source.*', elog))[-1]
            # most recent PM1_STS line of the cached logs, 'unknown' if none
            for line in reversed(self._logs):
                match = re.search(r'PM1_STS: WAK.*', line)
                if match:
                    wake_syslog = match.group(0)
                    break
            else:
                wake_syslog = 'unknown'
            # each whitelist entry holds three regexes: board, eventlog wake
            # source and syslog wake source; all of them have to match
            for b, e, s in sys_power.SpuriousWakeupError.S3_WHITELIST:
                if (re.search(b, utils.get_board()) and
                        re.search(e, wake_elog) and re.search(s, wake_syslog)):
                    logging.warning('Whitelisted spurious wake in S3: %s | %s',
                                    wake_elog, wake_syslog)
                    return None
            raise sys_power.SpuriousWakeupError('Spurious wake in S3: %s | %s'
                    % (wake_elog, wake_syslog))
        # Reached when the last attempt found no parsable timestamp at all
        if self._get_board() in ['lumpy', 'stumpy', 'kiev']:
            logging.debug('RTC read failure (crosbug/36004), dumping nvram:\n' +
                    utils.system_output('mosys nvram dump', ignore_status=True))
            return None
        raise error.TestError('Broken RTC timestamp: ' +
                              utils.read_file(self.HWCLOCK_FILE))


    def _firmware_resume_time(self):
        """Return the seconds spent in firmware on resume, from the logged TSC.

        Only implemented for x86, 0 is returned on every other architecture.
        0 is also returned when the logs report a resume from
        suspend-to-idle.

        @raises error.TestError: if the logs hold no TSC value.
        """
        if utils.get_arch() not in ['i686', 'x86_64']:
            # TODO: support this on ARM somehow
            return 0
        idle_pattern = re.compile(r'PM: resume from suspend-to-idle')
        tsc_pattern = re.compile(r'TSC at resume: (\d+)$')
        max_freq = utils.read_one_line(
                '/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq')
        tsc_freq = 1000 * int(max_freq)
        # newest lines first
        for entry in reversed(self._logs):
            if idle_pattern.search(entry):
                logging.info('fw resume time zero due to suspend-to-idle\n')
                return 0
            tsc = tsc_pattern.search(entry)
            if tsc is None:
                continue
            return float(tsc.group(1)) / tsc_freq

        raise error.TestError('Failed to find TSC resume value in syslog.')


    def _device_resume_time(self):
        """Return the overall device resume time in seconds.

        Takes the number from the newest cached log line that reports the
        resume of devices as complete and divides it by 1000.

        @raises error.TestError: if there is no such line.
        """
        pattern = re.compile(r'PM: resume of devices complete after ([0-9.]+)')
        hits = (pattern.search(entry) for entry in reversed(self._logs))
        for hit in hits:
            if hit:
                return float(hit.group(1)) / 1000

        raise error.TestError('Failed to find device resume time in syslog.')

    def _get_phase_times(self):
        """Return (phase, kernel timestamp) tuples sorted by timestamp.

        One tuple is produced for every cached log line reporting that a
        suspend or resume of devices completed. The phase is the upper-cased
        word in front of "resume"/"suspend" (captured together with its
        trailing space), or 'REG' if there is no such word.
        """
        pattern = re.compile(
                r'PM: (\w+ )?(resume|suspend) of devices complete')
        entries = []
        for line in self._logs:
            found = pattern.search(line)
            if not found:
                continue
            stamp = cros_logging.extract_kernel_timestamp(line)
            label = found.group(1) or 'REG'
            entries.append((label.upper(), stamp))
        entries.sort(key=lambda entry: entry[1])
        return entries

    def _get_phase(self, ts, phase_table, dev):
        """Return the first phase of phase_table whose cutoff lies after ts.

        @param ts: Timestamp to classify.
        @param phase_table: Sequence of (phase, cutoff timestamp) entries,
                            scanned in the given order.
        @param dev: Device name, only used for the error message.

        @raises error.TestError: if ts is not below any cutoff.
        """
        for entry in phase_table:
            # checking if timestamp was before that phase's cutoff
            if ts < entry[1]:
                return entry[0]
        # Exceptions do not apply format arguments themselves, so the
        # message has to be formatted here.
        raise error.TestError('Device %s has a timestamp after all devices '
                              'had already resumed' % dev)

    def _individual_device_times(self, start_resume):
        """Collect individual device suspend and resume times from the logs.

        Appends a new dict to self.device_times. Its keys have the form
        'seconds_dev_<device>_suspend' or 'seconds_dev_<device>_resume', its
        values are the seconds summed up over all phases. A per-device
        report is written to the debug log. Nothing is returned.

        @param start_resume: Timestamp compared against the kernel timestamp
                             of every log line; later lines count as resume,
                             all others as suspend.
        """
        self.device_times.append(dict())
        dev_details = collections.defaultdict(dict)
        regex = re.compile(r'call ([^ ]+)\+ returned 0 after ([0-9]+) usecs')
        phase_table = self._get_phase_times()
        for line in self._logs:
            match = regex.search(line)
            if not match:
                continue
            device = match.group(1).replace(':', '-')
            secs = float(match.group(2)) / 1e6
            ts = cros_logging.extract_kernel_timestamp(line)
            direction = '_resume' if ts > start_resume else '_suspend'
            key = 'seconds_dev_' + device + direction
            # looking if we're in a special phase
            phase = self._get_phase(ts, phase_table, device)
            dev = dev_details[key]
            if phase in dev:
                logging.warning('Duplicate %s entry for device %s, +%f', phase,
                                device, secs)
                dev[phase] += secs
            else:
                dev[phase] = secs

        for dev_key, dev in dev_details.items():
            total_secs = sum(dev.values())
            self.device_times[-1][dev_key] = total_secs
            report = '%s: %f TOT' % (dev_key, total_secs)
            for phase in dev:
                # Compare by value: 'is' tests object identity, which is not
                # guaranteed for equal strings (the phase names are built
                # with str.upper()).
                if phase == 'REG':
                    continue
                report += ', %f %s' % (dev[phase], phase)
            logging.debug(report)

    def _identify_driver(self, device):
        """Look up the driver of a device below /sys/devices.

        @param device: Directory name of the device.

        @returns: Base name of the resolved 'driver' entry of the first
                  directory called device, or "unknown" if there is no such
                  directory or it has no 'driver' entry.
        """
        for parent, children, _ in os.walk('/sys/devices'):
            if device not in children:
                continue
            driver_link = os.path.join(parent, device, 'driver')
            if os.path.exists(driver_link):
                return os.path.basename(os.path.realpath(driver_link))
            return "unknown"
        return "unknown"


    def _check_for_errors(self, ignore_kernel_warns):
        """Find and identify suspend errors.

        Scans the cached log lines for kernel warnings, aborted suspends and
        powerd_suspend errors.

        @param ignore_kernel_warns: If set, non-whitelisted kernel warnings
                                    are only logged instead of raised.

        @returns: True iff we should retry.

        @raises:
          sys_power.KernelError: for non-whitelisted kernel failures.
          sys_power.SuspendTimeout: took too long to enter suspend.
          sys_power.SpuriousWakeupError: woke too soon from suspend.
          sys_power.SuspendFailure: unidentified failure.
        """
        warning_regex = re.compile(r' kernel: \[.*WARNING:')
        abort_regex = re.compile(r' kernel: \[.*Freezing of tasks abort'
                r'| powerd_suspend\[.*Cancel suspend at kernel'
                r'| kernel: \[.*PM: Wakeup pending, aborting suspend')
        # rsyslogd can put this out of order with dmesg, so track in variable
        fail_regex = re.compile(r'powerd_suspend\[\d+\]: Error')
        failed = False

        # TODO(scottz): warning_monitor crosbug.com/38092
        log_len = len(self._logs)
        for i, line in enumerate(self._logs):
            if warning_regex.search(line):
                # match the source file from the WARNING line, and the
                # actual error text by peeking one or two lines below that
                src = cros_logging.strip_timestamp(line)
                text = ''
                if i+1 < log_len:
                    text = cros_logging.strip_timestamp(self._logs[i + 1])
                if i+2 < log_len:
                    text += '\n' + cros_logging.strip_timestamp(
                        self._logs[i + 2])
                for p1, p2 in sys_power.KernelError.WHITELIST:
                    if re.search(p1, src) and re.search(p2, text):
                        logging.info('Whitelisted KernelError: %s', src)
                        break
                else:
                    if ignore_kernel_warns:
                        logging.warning('Non-whitelisted KernelError: %s', src)
                    else:
                        raise sys_power.KernelError("%s\n%s" % (src, text))
            if abort_regex.search(line):
                wake_source = 'unknown'
                # Look at the lines around the abort message. The start is
                # clamped to 0, since a negative start index would count
                # from the end of the list and select the wrong lines.
                context = self._logs[max(0, i - 5):i + 3]
                match = re.search(r'last active wakeup source: (.*)$',
                        '\n'.join(context), re.MULTILINE)
                if match:
                    wake_source = match.group(1)
                driver = self._identify_driver(wake_source)
                for b, w in sys_power.SpuriousWakeupError.S0_WHITELIST:
                    if (re.search(b, utils.get_board()) and
                            re.search(w, wake_source)):
                        logging.warning('Whitelisted spurious wake before '
                                        'S3: %s | %s', wake_source, driver)
                        return True
                if "rtc" in driver:
                    raise sys_power.SuspendTimeout('System took too '
                                                   'long to suspend.')
                raise sys_power.SpuriousWakeupError('Spurious wake '
                        'before S3: %s | %s' % (wake_source, driver))
            if fail_regex.search(line):
                failed = True

        if failed:
            raise sys_power.SuspendFailure('Unidentified problem.')
        return False

    def suspend(self, duration=10, ignore_kernel_warns=False):
        """
        Do a single suspend for 'duration' seconds. Estimates the amount of time
        it takes to suspend for a board (see _SUSPEND_DELAY), so the actual RTC
        wakeup delay will be longer.

        A suspend that hit a whitelisted problem is repeated, up to ten
        times. A SuspendFailure is logged and appended to self.failures;
        the measurements of a successful cycle are appended to
        self.successes.

        @param duration: time in seconds to do a suspend prior to waking.
        @param ignore_kernel_warns: Ignore kernel errors.  Defaults to false.

        @returns: a dict of general measurements, or a tuple
                  (general_measurements, individual_device_times) when the
                  device_times attribute exists. None on a SuspendFailure
                  when _throw is not set.

        @raises error.TestWarn: if all ten tries hit a whitelisted bug, or,
                with _throw set, on a KernelError or SuspendTimeout.
        @raises error.TestFail: with _throw set, on any other SuspendFailure.
        @raises error.TestError: if the resume took longer than
                _MAX_RESUME_TIME.
        """

        if power_utils.get_sleep_state() == 'freeze':
            self._s0ix_residency_stats = power_status.S0ixResidencyStats()

        try:
            iteration = len(self.failures) + len(self.successes) + 1
            # Retry suspend in case we hit a known (whitelisted) bug
            for _ in xrange(10):
                self._reset_logs()
                utils.system('sync')
                board_delay = self._SUSPEND_DELAY.get(self._get_board(),
                        self._DEFAULT_SUSPEND_DELAY)
                try:
                    alarm = self._suspend(duration + board_delay)
                except sys_power.SpuriousWakeupError:
                    # might be another error, we check for it ourselves below
                    alarm = self._ALARM_FORCE_EARLY_WAKEUP

                if os.path.exists('/sys/firmware/log'):
                    for msg in re.findall(r'^.*ERROR.*$',
                            utils.read_file('/sys/firmware/log'), re.M):
                        for board, pattern in sys_power.FirmwareError.WHITELIST:
                            if (re.search(board, utils.get_board()) and
                                    re.search(pattern, msg)):
                                logging.info('Whitelisted FW error: ' + msg)
                                break
                        else:
                            # not whitelisted: keep a copy of the firmware
                            # log and report the error
                            firmware_log = os.path.join(self._logdir,
                                    'firmware.log.' + str(iteration))
                            shutil.copy('/sys/firmware/log', firmware_log)
                            logging.info('Saved firmware log: ' + firmware_log)
                            raise sys_power.FirmwareError(msg.strip('\r\n '))

                self._update_logs()
                if not self._check_for_errors(ignore_kernel_warns):
                    hwclock_ts = self._hwclock_ts(alarm)
                    # None means a whitelisted problem, so try again
                    if hwclock_ts:
                        break

            else:
                raise error.TestWarn('Ten tries failed due to whitelisted bug')

            # calculate general measurements
            start_resume = self._ts('start_resume_time')
            kernel_down = (self._ts('end_suspend_time') -
                           self._ts('start_suspend_time'))
            kernel_up = self._ts('end_resume_time') - start_resume
            devices_up = self._device_resume_time()
            total_up = hwclock_ts - alarm
            firmware_up = self._firmware_resume_time()
            board_up = total_up - kernel_up - firmware_up
            try:
                cpu_up = self._ts('cpu_ready_time', 0) - start_resume
            except error.TestError:
                # can be missing on non-SMP machines
                cpu_up = None
            if total_up > self._MAX_RESUME_TIME:
                raise error.TestError('Sanity check failed: missed RTC wakeup.')

            # cpu_up may be None, which '%g' cannot format; the logging call
            # would fail and the whole line would be lost.
            cpu_text = 'n/a' if cpu_up is None else '%g' % cpu_up
            logging.info('Success(%d): %g down, %g up, %g board, %g firmware, '
                         '%g kernel, %s cpu, %g devices',
                         iteration, kernel_down, total_up, board_up,
                         firmware_up, kernel_up, cpu_text, devices_up)

            if hasattr(self, '_s0ix_residency_stats'):
                s0ix_residency_secs = \
                        self._s0ix_residency_stats.\
                                get_accumulated_residency_secs()
                if not s0ix_residency_secs:
                    raise sys_power.S0ixResidencyNotChanged(
                        'S0ix residency did not change.')
                logging.info('S0ix residency : %d secs.', s0ix_residency_secs)

            self.successes.append({
                'seconds_system_suspend': kernel_down,
                'seconds_system_resume': total_up,
                'seconds_system_resume_firmware': firmware_up + board_up,
                'seconds_system_resume_firmware_cpu': firmware_up,
                'seconds_system_resume_firmware_ec': board_up,
                'seconds_system_resume_kernel': kernel_up,
                'seconds_system_resume_kernel_cpu': cpu_up,
                'seconds_system_resume_kernel_dev': devices_up,
                })

            if hasattr(self, 'device_times'):
                self._individual_device_times(start_resume)
                return (self.successes[-1], self.device_times[-1])
            else:
                return self.successes[-1]

        except sys_power.SuspendFailure as ex:
            message = '%s(%d): %s' % (type(ex).__name__, iteration, ex)
            logging.error(message)
            self.failures.append(ex)
            if self._throw:
                if type(ex).__name__ in ['KernelError', 'SuspendTimeout']:
                    raise error.TestWarn(message)
                else:
                    raise error.TestFail(message)
            return None


    def finalize(self):
        """Restore normal environment (not turning 3G back on for now...)

        May be called more than once (__del__ calls it again). tlsdated and
        pm_print_times are only restored while HWCLOCK_FILE still exists,
        and the first call removes that file. The power pref changer is
        finalized once and then forgotten.
        """
        if os.path.exists(self.HWCLOCK_FILE):
            os.remove(self.HWCLOCK_FILE)
            if self._restart_tlsdated:
                utils.system('initctl start tlsdated')
            if self._reset_pm_print_times:
                self._set_pm_print_times(False)
        if hasattr(self, '_power_pref_changer'):
            # Drop the attribute before finalizing, so that a repeated call
            # does not finalize the same changer a second time.
            changer = self._power_pref_changer
            del self._power_pref_changer
            changer.finalize()


    def __del__(self):
        """Run finalize() when the object is destroyed."""
        self.finalize()