#!/usr/bin/python -u import os, socket, sys, signal, time, subprocess, logging from optparse import OptionParser import common from autotest_lib.scheduler import babysitter_logging_config from autotest_lib.client.common_lib import error, global_config, utils from autotest_lib.client.common_lib import logging_manager from autotest_lib.scheduler import scheduler_logging_config from autotest_lib.scheduler import status_server from autotest_lib.scheduler import monitor_db PAUSE_LENGTH = 60 STALL_TIMEOUT = 2*60*60 parser = OptionParser() parser.add_option("-r", action="store_true", dest="recover", help=("run recovery mode (implicit after any crash)")) parser.add_option("--background", dest="background", action="store_true", default=False, help=("runs the scheduler monitor on " "background")) (options, args) = parser.parse_args() autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) results_dir = os.path.join(autodir, 'results') monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py') recover = (options.recover == True) if len(args) != 0: parser.print_help() sys.exit(1) def run_banner_output(cmd): """Returns ------ CMD ------\nCMD_OUTPUT in a string""" banner_output = '%s\n%%s\n\n' % cmd.center(60, '-') command_output = '' try: cmd_out = utils.run(cmd, ignore_status=True, timeout=30) command_output = cmd_out.stdout + cmd_out.stderr except error.CmdError: command_output = 'Timed out' return banner_output % command_output def kill_monitor(): logging.info("Killing monitor_db") # try shutdown first utils.signal_program(monitor_db.PID_FILE_PREFIX, sig=signal.SIGINT) if utils.program_is_alive(monitor_db.PID_FILE_PREFIX): # was it killed? # give it some time to shutdown time.sleep(30) # kill it utils.signal_process(monitor_db.PID_FILE_PREFIX) def handle_sigterm(signum, frame): logging.info('Caught SIGTERM') kill_monitor() utils.delete_pid_file_if_exists(monitor_db.BABYSITTER_PID_FILE_PREFIX) sys.exit(1) signal.signal(signal.SIGTERM, handle_sigterm) SiteMonitorProc = utils.import_site_class( __file__, 'autotest_lib.scheduler.site_monitor_db_babysitter', 'SiteMonitorProc', object) class MonitorProc(SiteMonitorProc): def __init__(self, do_recovery=False): args = [monitor_db_path] if do_recovery: args.append("--recover-hosts") args.append(results_dir) kill_monitor() environ = os.environ scheduler_config = scheduler_logging_config.SchedulerLoggingConfig log_name = scheduler_config.get_log_name() os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name scheduler_log_dir = scheduler_config.get_server_log_dir() self.log_path = os.path.join(scheduler_log_dir, log_name) self.log_size = 0 self.last_log_change = time.time() logging.info("STARTING monitor_db with log file %s" % self.log_path) self.args = args # Allow site specific code to run, set environment variables and # modify self.args if desired. super(MonitorProc, self).__init__() def start(self): devnull = open(os.devnull, 'w') self.proc = subprocess.Popen(self.args, stdout=devnull) def is_running(self): if self.proc.poll() is not None: logging.info("monitor_db DIED") return False old_size = self.log_size new_size = os.path.getsize(self.log_path) if old_size != new_size: logging.info("Log was touched") self.log_size = new_size self.last_log_change = time.time() elif self.last_log_change + STALL_TIMEOUT < time.time(): logging.info("monitor_db STALLED") self.collect_stalled_info() return False return True def collect_stalled_info(self): INFO_TO_COLLECT = ['uptime', 'ps auxwww', 'iostat -k -x 2 4', ] db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s' config = global_config.global_config try: user = config.get_config_value("BACKUP", "user") password = config.get_config_value("BACKUP", "password") db_cmd %= (user, password) INFO_TO_COLLECT.append(db_cmd) except global_config.ConfigError: pass stall_log_path = self.log_path + '.stall_info' log = open(stall_log_path, "w") for cmd in INFO_TO_COLLECT: log.write(run_banner_output(cmd)) log.close() if os.getuid() == 0: logging.critical("Running as root, aborting!") sys.exit(1) if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX): logging.critical("Monitor_db_babysitter already running, aborting!") sys.exit(1) utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX) if options.background: logging_manager.configure_logging( babysitter_logging_config.BabysitterLoggingConfig(use_console=False)) # Double fork - see http://code.activestate.com/recipes/66012/ try: pid = os.fork() if (pid > 0): sys.exit(0) # exit from first parent except OSError, e: sys.stderr.write("fork #1 failed: (%d) %s\n" % (e.errno, e.strerror)) sys.exit(1) # Decouple from parent environment os.chdir("/") os.umask(0) os.setsid() # Second fork try: pid = os.fork() if (pid > 0): sys.exit(0) # exit from second parent except OSError, e: sys.stderr.write("fork #2 failed: (%d) %s\n" % (e.errno, e.strerror)) sys.exit(1) else: logging_manager.configure_logging( babysitter_logging_config.BabysitterLoggingConfig()) while True: sock = socket.socket() try: # Try to bind to the same port as the status_server. sock.bind(('localhost', status_server._PORT)) except socket.error, msg: # If binding failed, open the port. logging.error('Failed to open socket with error:%s. Closing socket.', msg) release_port_cmd_list = ['fuser', '-k', '-n', 'tcp', '%d' % status_server._PORT] process = subprocess.Popen(release_port_cmd_list) process.wait() sock.close() proc = MonitorProc(do_recovery=recover) proc.start() time.sleep(PAUSE_LENGTH) while proc.is_running(): logging.info("Tick") time.sleep(PAUSE_LENGTH) recover = False