# (extraction artifact: "plain text | 300 lines | 9.46 KB" — not part of the program)

#!/usr/bin/env python
# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Standalone service to monitor AFE servers and report to ts_mon"""
import sys
import time
import logging
import multiprocessing
import urllib2

import common
from autotest_lib.client.common_lib import global_config
from autotest_lib.frontend.afe.json_rpc import proxy
from autotest_lib.server import frontend
# import needed to setup host_attributes
# pylint: disable=unused-import
from autotest_lib.server import site_host_attributes
from autotest_lib.site_utils import server_manager_utils
from chromite.lib import commandline
from chromite.lib import metrics
from chromite.lib import ts_mon_config

METRIC_ROOT = 'chromeos/autotest/blackbox/afe_rpc'
METRIC_RPC_CALL_DURATIONS = METRIC_ROOT + '/rpc_call_durations'
METRIC_TICK = METRIC_ROOT + '/tick'
METRIC_MONITOR_ERROR = METRIC_ROOT + '/afe_monitor_error'

FAILURE_REASONS = {
        proxy.JSONRPCException: 'JSONRPCException',
        }

def afe_rpc_call(hostname):
    """Perform one rpc call set on server

    Runs AfeMonitor against the host; on any failure, bumps the
    afe_monitor_error counter for that host and logs the traceback
    instead of propagating (so one bad host cannot kill the pool worker).

    @param hostname: server's hostname to poll
    """
    afe_monitor = AfeMonitor(hostname)
    try:
        afe_monitor.run()
    except Exception:
        # The exception itself is captured by logging.exception; no need
        # to bind it to a name.
        metrics.Counter(METRIC_MONITOR_ERROR).increment(
                fields={'target_hostname': hostname})
        logging.exception('Exception when running against host %s', hostname)


def update_shards(shards, shards_lock, period=600, stop_event=None):
    """Periodically refresh the shared list of shards.

    Diffs the currently known shards against server_manager_utils and
    applies removals/additions under the shared lock.

    @param shards: managed list of shard hostnames to be updated in place
    @param shards_lock: shared lock guarding access to shards
    @param period: seconds between polls
    @param stop_event: optional Event; the loop exits once it is set
             (a None stop_event means poll forever)
    """
    while not stop_event or not stop_event.is_set():
        start_time = time.time()

        logging.debug('Updating Shards')
        new_shards = set(server_manager_utils.get_shards())

        with shards_lock:
            current_shards = set(shards)
            rm_shards = current_shards - new_shards
            add_shards = new_shards - current_shards

            # Iterating an empty set is a no-op, so no guard is needed.
            for s in rm_shards:
                shards.remove(s)

            if add_shards:
                shards.extend(add_shards)

        # Log outside the lock to keep the critical section short.
        if rm_shards:
            logging.info('Servers left production: %s', rm_shards)

        if add_shards:
            logging.info('Servers entered production: %s', add_shards)

        # Sleep only for whatever remains of the period after the poll.
        wait_time = (start_time + period) - time.time()
        if wait_time > 0:
            time.sleep(wait_time)


def poll_rpc_servers(servers, servers_lock, shards=None, period=60,
                     stop_event=None):
    """Blocking function that polls all servers and shards

    @param servers: list of servers to poll
    @param servers_lock: lock to be used when accessing servers or shards
    @param shards: optional list of shards to poll (None means no shards)
    @param period: time between polls, in seconds
    @param stop_event: Event that can be set to stop polling
             (a None stop_event means poll forever)
    """
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count() * 4)

    try:
        while not stop_event or not stop_event.is_set():
            start_time = time.time()
            with servers_lock:
                # shards defaults to None; set.union(None) would raise
                # TypeError, so substitute an empty iterable.
                all_servers = set(servers).union(shards or [])

            logging.debug('Starting Server Polling: %s',
                          ', '.join(all_servers))
            pool.map(afe_rpc_call, all_servers)

            logging.debug('Finished Server Polling')

            metrics.Counter(METRIC_TICK).increment()

            # Sleep only for whatever remains of the period after the poll.
            wait_time = (start_time + period) - time.time()
            if wait_time > 0:
                time.sleep(wait_time)
    finally:
        # Reap the worker processes once polling stops (or on error).
        pool.close()
        pool.join()


class RpcFlightRecorder(object):
    """Monitors a list of AFE servers from background subprocesses."""

    def __init__(self, servers, with_shards=True, poll_period=60):
        """
        @param servers: list of afe services to monitor
        @param with_shards: also record status on shards
        @param poll_period: frequency to poll all services, in seconds
        """
        self._manager = multiprocessing.Manager()

        self._poll_period = poll_period

        # Managed (proxied) containers so the worker processes observe
        # updates made by the shard-updater process.
        self._servers = self._manager.list(servers)
        self._servers_lock = self._manager.RLock()

        self._with_shards = with_shards
        self._shards = self._manager.list()
        self._update_shards_ps = None
        self._poll_rpc_server_ps = None

        self._stop_event = multiprocessing.Event()

    def start(self):
        """Call to start recorder"""
        if self._with_shards:
            self._update_shards_ps = multiprocessing.Process(
                    name='update_shards',
                    target=update_shards,
                    args=[self._shards, self._servers_lock],
                    kwargs={'stop_event': self._stop_event})

            self._update_shards_ps.start()

        self._poll_rpc_server_ps = multiprocessing.Process(
                name='poll_rpc_servers',
                target=poll_rpc_servers,
                args=[self._servers, self._servers_lock],
                kwargs={'shards': self._shards,
                        'period': self._poll_period,
                        'stop_event': self._stop_event})

        self._poll_rpc_server_ps.start()

    def close(self):
        """Send close event to all sub processes"""
        self._stop_event.set()

    def terminate(self):
        """Terminate processes and shut down the manager."""
        self.close()
        if self._poll_rpc_server_ps:
            self._poll_rpc_server_ps.terminate()

        if self._update_shards_ps:
            self._update_shards_ps.terminate()

        if self._manager:
            self._manager.shutdown()

    # Backward-compatible alias for the original (misspelled) method name.
    termitate = terminate

    def join(self, timeout=None):
        """Blocking call until closed and processes complete

        @param timeout: passed to each process, so the total wait can
                 exceed timeout
        """
        if self._poll_rpc_server_ps:
            self._poll_rpc_server_ps.join(timeout)

        if self._update_shards_ps:
            self._update_shards_ps.join(timeout)

def _failed(fields, msg_str, reason, err=None):
    """Mark current run failed

    @param fields, ts_mon fields to mark as failed
    @param msg_str, message string to be filled
    @param reason: why it failed
    @param err: optional error to log more debug info
    """
    fields['success'] = False
    fields['failure_reason'] = reason
    logging.warning("%s failed - %s", msg_str, reason)
    if err:
        logging.debug("%s fail_err - %s", msg_str, str(err))

class AfeMonitor(object):
    """Runs a fixed set of rpc calls against a single afe frontend and
    records a duration/success metric for each call."""

    def __init__(self, hostname):
        """
        @param hostname: hostname of server to monitor, string
        """
        self._hostname = hostname
        self._afe = frontend.AFE(server=self._hostname)
        self._metric_fields = {'target_hostname': self._hostname}

    def run_cmd(self, cmd, expected=None):
        """Send one rpc command and record its outcome as a metric.

        @param cmd: string of rpc command to send
        @param expected: if given, the result must equal this value or the
                 call is recorded as an IncorrectResponse failure
        """
        fields = dict(self._metric_fields,
                      command=cmd,
                      success=True,
                      failure_reason='')

        msg_str = "%s:%s" % (self._hostname, cmd)

        with metrics.SecondsTimer(METRIC_RPC_CALL_DURATIONS,
                fields=dict(fields), scale=0.001) as f:
            try:
                result = self._afe.run(cmd)
                logging.debug("%s result = %s", msg_str, result)
                if expected is not None and expected != result:
                    _failed(f, msg_str, 'IncorrectResponse')

            except urllib2.HTTPError as e:
                _failed(f, msg_str, 'HTTPError:%d' % e.code)

            except Exception as e:
                _failed(f, msg_str, FAILURE_REASONS.get(type(e), 'Unknown'),
                        err=e)
                # Only exception types listed in FAILURE_REASONS are
                # swallowed; anything else propagates to the caller.
                if type(e) not in FAILURE_REASONS:
                    raise

            if f['success']:
                logging.info("%s success", msg_str)

    def run(self):
        """Run the monitoring rpc sequence against the server."""
        self.run_cmd('get_server_time')
        self.run_cmd('ping_db', [True])


def get_parser():
    """Build and return the command-line argument parser."""
    arg_parser = commandline.ArgumentParser(description=__doc__)
    arg_parser.add_argument('-a', '--afe', action='append', default=[],
                            help='Autotest FrontEnd server to monitor')
    arg_parser.add_argument('-p', '--poll-period', type=int, default=60,
                            help='Frequency to poll AFE servers')
    arg_parser.add_argument('--no-shards', action='store_false',
                            dest='with_shards',
                            help='Disable shard updating')
    return arg_parser


def main(argv):
    """Entry point: parse arguments and run the flight recorder.

    @param argv: commandline arguments passed
    """
    options = get_parser().parse_args(argv[1:])

    # Fall back to the globally configured AFE when none was given.
    if not options.afe:
        default_afe = global_config.global_config.get_config_value(
                'SERVER', 'global_afe_hostname', default='cautotest')
        options.afe = [default_afe]

    with ts_mon_config.SetupTsMonGlobalState('rpc_flight_recorder',
                                             indirect=True):
        recorder = RpcFlightRecorder(options.afe,
                                     with_shards=options.with_shards,
                                     poll_period=options.poll_period)
        recorder.start()
        recorder.join()


if __name__ == '__main__':
    # Invoked as a script: runs until the polling processes are stopped.
    main(sys.argv)