普通文本  |  102行  |  2.71 KB

#!/usr/bin/python

"""
Poll server-status on cautotest to watch for RPCs taking longer than 10s. Then
we go and ssh around to figure out what the command line of the process that
caused the RPC was so that one can track down what is generating the expensive
RPC load.
"""

try:
  from bs4 import BeautifulSoup
except ImportError:
  print 'Run `apt-get install python-bs4`'
  raise

import time
import subprocess
import multiprocessing

import common
import requests


def check_cautotest(min_seconds=10):
  """Find workers on cautotest that have been busy on one request too long.

  Fetches http://cautotest/server-status and scans the rows of the first
  table on the page.  A row is reported when its fourth cell is 'W', its
  sixth cell (seconds) is greater than `min_seconds`, and its second cell
  (the PID) is not the '-' placeholder.

  Args:
    min_seconds: Only report rows whose seconds cell exceeds this value.
        Defaults to 10, the threshold this script has always used.

  Returns:
    A list of (pid, mode, seconds) tuples of strings, one per slow worker.
    The list is empty when the page has no table or nothing is slow.
  """
  page = requests.get('http://cautotest/server-status').text
  soup = BeautifulSoup(page)
  # An error page or an unexpected response may contain no table at all.
  if soup.table is None:
    return []
  pids = []
  for row in soup.table.findAll('tr'):
    cols = [x.text.strip() for x in row.findAll('td')]
    # Header rows have no <td> cells; also skip any row too short to index.
    if len(cols) < 6:
      continue
    pid, mode, seconds = cols[1], cols[3], cols[5]
    if mode != 'W' or pid == '-':
      continue
    try:
      elapsed = int(seconds)
    except ValueError:
      # A non-numeric seconds cell should not take down the polling loop.
      continue
    if elapsed > min_seconds:
      pids.append((pid, mode, seconds))
  return pids

def pull_cautotest_info(proc_id):
  """Find the remote end of the connection held by a cautotest process.

  Runs lsof on cautotest (through `become`), restricted to the network
  sockets of `proc_id`, and parses the first ESTABLISHED line.  The ninth
  whitespace-separated field of that line is expected to look like
  `local:port->remote:port`.

  Args:
    proc_id: PID of the process on cautotest.

  Returns:
    [remote_host, remote_port] as strings, or None if the command fails,
    the process has no established connection, or the output cannot be
    parsed.
  """
  try:
    # `-a -p PID -i` selects only this PID's internet sockets.  Grepping
    # the full listing for the PID also matched it as a substring of other
    # PIDs, ports and addresses.
    conn = subprocess.check_output('become chromeos-test@cautotest -- '
           '"sudo lsof -a -p %s -i | grep -e ESTABLISHED"' % proc_id,
           shell=True)
    remote_end = conn.split()[8].split('->')[1]
    # Split on the last ':' only, so a host containing colons still yields
    # exactly [host, port].
    remote_info = remote_end.rsplit(':', 1)
  except Exception:
    # Best effort: any failure simply means "no remote info available".
    remote_info = None
  return remote_info

def strace_cautotest(proc_id):
  """Capture the beginning of an strace of a process on cautotest.

  Attaches strace to `proc_id` on cautotest through `become`, merging
  stderr into stdout and keeping only the first 20 lines.

  Args:
    proc_id: PID of the process on cautotest to trace.

  Returns:
    The captured output, or "" if the command exits with a non-zero status.
  """
  command = ('become chromeos-test@cautotest -- '
             '"sudo strace -s 500 -p %s 2>&1 | head -n 20"' % proc_id)
  try:
    return subprocess.check_output(command, shell=True)
  except subprocess.CalledProcessError:
    return ""

def pull_drone_info(host, port):
  """Look up the command line of the process on `host` that owns `port`.

  Runs lsof on `host` (through `become`) to find the process holding an
  ESTABLISHED connection on `port`, then reads that process's
  /proc/<pid>/cmdline.

  Args:
    host: Machine to inspect.
    port: Port number of the connection on that machine.

  Returns:
    The command line with its NUL separators replaced by spaces, or '' if
    any step fails.
  """
  try:
    # `lsof -i :PORT` matches the port exactly.  Grepping for ':PORT' also
    # matched longer ports that merely start with the same digits.
    lsof = subprocess.check_output('become chromeos-test@%s -- '
           '"sudo lsof -i :%s | grep -e ESTABLISHED"'
           % (host, port), shell=True)
    # The second field of an lsof line is the PID.
    proc_id = lsof.split()[1]
    cmdline = subprocess.check_output('become chromeos-test@%s -- '
              '"cat /proc/%s/cmdline"' % (host, proc_id), shell=True)
    # /proc/<pid>/cmdline separates arguments with NUL bytes, which print
    # as nothing and make the arguments run together.
    cmdline = cmdline.replace(b'\0', b' ').strip()
  except Exception:
    # Best effort: any failure simply means "no drone info available".
    cmdline = ''
  return cmdline

def pull_all_data(pid, queue):
  """Gather all diagnostics for one slow worker and put them on `queue`.

  Args:
    pid: Sequence whose first element is the process id on cautotest; the
        whole value is passed through unchanged in the result.
    queue: Object with a put() method that receives the result.

  Puts (pid, remote_info, drone_info, straced) on the queue.  drone_info is
  None when no remote connection was found.  If anything raises, None is
  put instead, so exactly one item is normally queued per call.
  """
  try:
    proc_id = pid[0]
    remote_info = pull_cautotest_info(proc_id)
    drone_info = pull_drone_info(*remote_info) if remote_info else None
    straced = strace_cautotest(proc_id)
    result = (pid, remote_info, drone_info, straced)
    queue.put(result)
  except Exception:
    queue.put(None)

def print_data(x):
  """Print the diagnostics collected for one slow worker, then ring the bell.

  Args:
    x: A (pid, remote_info, drone_info, straced) tuple.  `pid` must itself
        be a 3-tuple, since it fills the three %s slots of the header line.
  """
  (pid, remote_info, drone_info, straced) = x
  # Single-argument print(...) gives the same output as the print statement
  # under Python 2 and is also valid as a function call under Python 3.
  print("*** %s stuck in %s for %s secs" % pid)
  print(remote_info)
  print(drone_info)
  print(straced)
  # '\a' is the terminal bell, an audible alert for whoever is watching.
  print('\a')

def main():
  """Poll cautotest forever and report every slow worker found.

  Each pass asks check_cautotest() for slow workers, starts one child
  process per worker to gather diagnostics in parallel, prints each result
  as it arrives, cleans up the children, and sleeps 5 seconds.
  """
  while True:
    queue = multiprocessing.Queue()
    processes = []
    for pid in check_cautotest():
      proc = multiprocessing.Process(target=pull_all_data, args=(pid, queue))
      proc.start()
      processes.append(proc)
    # pull_all_data puts one item per worker (None on failure), so one
    # get() per started process drains the queue.
    for _ in processes:
      x = queue.get()
      if x:
        print_data(x)
    for proc in processes:
      proc.terminate()
      # Reap the child so finished workers do not linger as zombies.
      proc.join()
    time.sleep(5)


# Guarded so that importing this module does not start the infinite loop.
if __name__ == '__main__':
  main()