普通文本  |  129行  |  4.84 KB

# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# This file lets us test the repair supporting code.
# We could not easily unit test it if it were in the repair file, as that file
# makes a function call that is not protected by an
# `if __name__ == '__main__'` guard.

import datetime, getpass, logging, operator, smtplib, urllib2, xmlrpclib

import common

from autotest_lib.client.common_lib import global_config, mail, logging_config
from autotest_lib.server import frontend
from autotest_lib.server.cros.dynamic_suite import reporting


# Receiver and sender addresses used if we need to send an email. Both are
# read from the SCHEDULER section of the global config when this module is
# imported. The receiver falls back to an empty string and the sender falls
# back to the name of the user running this process.
_NOTIFY_ADDRESS = global_config.global_config.get_config_value(
    'SCHEDULER', 'notify_email_errors', default='')
_SENDER_ADDRESS = global_config.global_config.get_config_value(
    'SCHEDULER', "notify_email_from", default=getpass.getuser())

# Ignore any jobs that were run more than this many minutes past the max job
# timeout.
_CUTOFF_AFTER_TIMEOUT_MINS = 60
# Max job runtime in minutes, read from the AUTOTEST_WEB section of the global
# config; 0 when the option is not set, in which case only
# _CUTOFF_AFTER_TIMEOUT_MINS bounds how far back we search.
_DEFAULT_TEST_TIMEOUT_MINS = global_config.global_config.get_config_value(
    'AUTOTEST_WEB', 'job_max_runtime_mins_default', type=int,
    default=0)


class MachineDeathLogger(logging_config.LoggingConfig):
    """
    Logging configuration for machines entering the Repair Failed state.

    Records go to their own file inside the server log directory reported by
    the base class, so they stay separate from the other logs and follow the
    default log location if it is ever moved.

    """
    LOGFILE_NAME = 'machine_death.log'
    file_formatter = logging.Formatter(datefmt='%m/%d %H:%M:%S',
                                       fmt='%(asctime)s | %(message)s')

    def __init__(self):
        base = super(MachineDeathLogger, self)
        base.__init__(False)
        # Swap in the dedicated 'machine_death' logger before any handlers
        # are configured, so the calls below operate on it.
        self.logger = logging.getLogger('machine_death')

        # No console output; only ERROR and above is written to the file.
        base.configure_logging(use_console=False)
        self.add_file_handler(self.LOGFILE_NAME, logging.ERROR,
                              log_dir=self.get_server_log_dir())


def _find_problem_test(machine, rpc):
    """
    Return the most recently started job entry for a machine.

    Only entries started within the last
    _DEFAULT_TEST_TIMEOUT_MINS + _CUTOFF_AFTER_TIMEOUT_MINS minutes are
    considered. When global_config has no job_max_runtime_mins_default the
    first term is 0, so the search reaches back only
    _CUTOFF_AFTER_TIMEOUT_MINS minutes.

    @param machine: The hostname (e.g. IP address) of the machine whose last
        job we are looking for.

    @param rpc: The rpc object to contact the server with.

    @return the job status dictionary of the entry with the greatest
        'started_on' value, or None if no entry falls inside the window.
    """
    # The RPC interface does not give us django's QuerySet latest(), so we
    # fetch every entry started inside the lookback window and pick the
    # newest one ourselves.
    lookback = (datetime.timedelta(minutes=_DEFAULT_TEST_TIMEOUT_MINS) +
                datetime.timedelta(minutes=_CUTOFF_AFTER_TIMEOUT_MINS))
    earliest_start = datetime.datetime.today() - lookback

    entries = rpc.run('get_host_queue_entries', host__hostname=machine,
                      started_on__gte=str(earliest_start))
    if not entries:
        return None
    return max(entries, key=operator.itemgetter('started_on'))


def flag_problem_test(machine):
    """
    Notify people about the last job that ran on a machine.

    This method is invoked every time a machine fails to repair, and attempts
    to identify the last test that ran on the machine. If successful, it
    tries to file a bug; if no bug id comes back it sends an email instead;
    and if the email cannot be sent it records the job in the machine death
    log. Nothing is reported when no recent job is found.

    All notification steps are best effort: a failure to reach the RPC
    server or an SMTP failure is logged rather than raised.

    @param machine: The hostname (e.g. IP address) of the machine to find the
        last job ran on it.

    """
    rpc = frontend.AFE()
    logger = MachineDeathLogger()

    try:
        problem_test = _find_problem_test(machine, rpc)
    except (urllib2.URLError, xmlrpclib.ProtocolError):
        logger.logger.error('%s | ERROR: Could not contact RPC server',
                            machine)
        return

    if not problem_test:
        return

    job_id = problem_test['job']['id']
    job_name = problem_test['job']['name']
    bug = reporting.MachineKillerBug(job_id=job_id,
                                     job_name=job_name,
                                     machine=machine)
    bug_id = reporting.Reporter().report(bug)[0]
    if bug_id is not None:
        # A bug was filed; no further notification is needed.
        return

    email_prefix = ('The following test is killing a machine, '
                    'could not file a bug to report this:\n\n')
    try:
        mail.send(_SENDER_ADDRESS, _NOTIFY_ADDRESS, '',
                  bug.title(), email_prefix + bug.summary())
    except smtplib.SMTPException:
        # Catch the whole SMTP error family, not just SMTPDataError: refused
        # recipients/sender or a failed HELO must also fall through to the
        # log instead of propagating out of this notification helper.
        logger.logger.error('%s | %d | %s', machine, job_id, job_name)