/**
 * @file opd_perfmon.c
 * perfmonctl() handling
 *
 * @remark Copyright 2003 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon
 */

#ifdef __ia64__

/* need this for sched_setaffinity() in <sched.h> */
#define _GNU_SOURCE

#include "oprofiled.h"
#include "opd_perfmon.h"
#include "opd_events.h"

#include "op_cpu_type.h"
#include "op_libiberty.h"
#include "op_hw_config.h"

#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#ifdef HAVE_SCHED_SETAFFINITY
#include <sched.h>
#endif

extern op_cpu cpu_type;

#ifndef HAVE_SCHED_SETAFFINITY

/* many glibc versions do not yet define this syscall number */
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 1231
#endif

/*
 * Fallback CPU set type and helpers, compiled only when
 * HAVE_SCHED_SETAFFINITY is not defined (so <sched.h> is not included).
 * Copied from glibc's <sched.h> and <bits/sched.h> and munged.
 */
/* number of CPUs representable in a cpu_set_t */
#define CPU_SETSIZE	1024
/* number of bits held by each word of the __bits array */
#define __NCPUBITS	(8 * sizeof (unsigned long))
/* bitmap with one bit per CPU */
typedef struct
{
	unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
} cpu_set_t;

/* set the bit for CPU number cpu; cpu is not range-checked against CPU_SETSIZE */
#define CPU_SET(cpu, cpusetp) \
	((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS)))
/* clear every bit of the set */
#define CPU_ZERO(cpusetp) \
	memset((cpusetp), 0, sizeof(cpu_set_t))

/**
 * Fallback sched_setaffinity() used when the C library provides none:
 * the arguments are handed unchanged to the raw system call.
 *
 * @param pid      process whose affinity is set
 * @param len      size in bytes of the set pointed to by cpusetp
 * @param cpusetp  CPU set to apply
 * @return the result of syscall(), converted to int
 */
static int
sched_setaffinity(pid_t pid, size_t len, cpu_set_t const * cpusetp)
{
	int const rc = syscall(__NR_sched_setaffinity, pid, len, cpusetp);

	return rc;
}
#endif


#ifndef HAVE_PERFMONCTL
#ifndef __NR_perfmonctl
#define __NR_perfmonctl 1175
#endif

/**
 * Fallback perfmonctl() used when the C library provides none:
 * the arguments are handed unchanged to the raw system call.
 *
 * @param fd    perfmon context descriptor
 * @param cmd   perfmon command (PFM_*)
 * @param arg   command argument array, may be 0
 * @param narg  number of elements in arg
 * @return the result of syscall(), converted to int
 */
static int perfmonctl(int fd, int cmd, void * arg, int narg)
{
	int const rc = syscall(__NR_perfmonctl, fd, cmd, arg, narg);

	return rc;
}
#endif


/**
 * 16-byte identifier that create_context() copies into ctx_smpl_buf_id
 * before issuing PFM_CREATE_CONTEXT.
 * NOTE(review): presumably names the perfmon sampling buffer format the
 * kernel should attach to the context - confirm against the perfmon headers.
 */
static unsigned char uuid[16] = {
	0x77, 0x7a, 0x6e, 0x61, 0x20, 0x65, 0x73, 0x69,
	0x74, 0x6e, 0x72, 0x20, 0x61, 0x65, 0x0a, 0x6c
};


/** number of entries in children; set from sysconf(_SC_NPROCESSORS_ONLN) in perfmon_init() */
static size_t nr_cpus;

/**
 * State of one perfmon context owner: a per-CPU child process, or the
 * daemon itself when perfmon_init() takes the !no_xen path.
 */
struct child {
	/** process id: getpid() in the child, fork() result in the parent */
	pid_t pid;
	/** pipe the child uses to report that it is up; [0] read end, [1] write end */
	int up_pipe[2];
	/** perfmon context descriptor filled in by create_context() */
	int ctx_fd;
	/*
	 * The flags below are written from signal handlers and read by the
	 * main flow of the child, so they must be volatile sig_atomic_t to
	 * keep the compiler from caching them across the handler invocation.
	 */
	/** set by the SIGUSR1 handler; run_child() then issues PFM_START */
	volatile sig_atomic_t sigusr1;
	/** set by the SIGUSR2 handler; run_child() then issues PFM_STOP */
	volatile sig_atomic_t sigusr2;
	/** initialised to 0; not otherwise used in this file */
	volatile sig_atomic_t sigterm;
};

/** array of nr_cpus entries indexed by CPU number; allocated in perfmon_init() */
static struct child * children;

/**
 * Issue PFM_START on the perfmon context ctx_fd.
 * Exits the process with EXIT_FAILURE if perfmonctl() returns -1.
 */
static void perfmon_start_child(int ctx_fd)
{
	int const rc = perfmonctl(ctx_fd, PFM_START, 0, 0);

	if (rc != -1)
		return;

	exit(EXIT_FAILURE);
}


/**
 * Issue PFM_STOP on the perfmon context ctx_fd.
 * Exits the process with EXIT_FAILURE if perfmonctl() returns -1.
 */
static void perfmon_stop_child(int ctx_fd)
{
	int const rc = perfmonctl(ctx_fd, PFM_STOP, 0, 0);

	if (rc != -1)
		return;

	exit(EXIT_FAILURE);
}


static void child_sigusr1(int val __attribute__((unused)))
{
	size_t i;

	for (i = 0; i < nr_cpus; ++i) {
		if (children[i].pid == getpid()) {
			children[i].sigusr1 = 1;
			return;
		}
	}
}


static void child_sigusr2(int val __attribute__((unused)))
{
	size_t i;

	for (i = 0; i < nr_cpus; ++i) {
		if (children[i].pid == getpid()) {
			children[i].sigusr2 = 1;
			return;
		}
	}
}


/**
 * SIGTERM handler: forward the signal to the parent process.
 *
 * errno is saved and restored around kill() because SIGTERM is not
 * blocked in the child and may interrupt code that inspects errno
 * afterwards (e.g. the write() retry loop in notify_parent()).
 */
static void child_sigterm(int val __attribute__((unused)))
{
	int const saved_errno = errno;

	kill(getppid(), SIGTERM);

	errno = saved_errno;
}


/**
 * Restrict the calling process to run on the single CPU number cpu.
 * Prints a diagnostic and exits with EXIT_FAILURE if
 * sched_setaffinity() returns -1.
 */
static void set_affinity(size_t cpu)
{
	cpu_set_t only_cpu;

	CPU_ZERO(&only_cpu);
	CPU_SET(cpu, &only_cpu);

	if (sched_setaffinity(getpid(), sizeof(only_cpu), &only_cpu) != -1)
		return;

	perror("Failed to set affinity");
	exit(EXIT_FAILURE);
}


/**
 * Prepare signal handling in a perfmon child.
 *
 * SIGUSR1 and SIGUSR2 are added to the blocked set of the process, then
 * handlers for SIGUSR1, SIGUSR2 and SIGTERM are installed (in that
 * order) with no flags and an empty handler mask. Exits with
 * EXIT_FAILURE if any sigaction() call fails.
 */
static void setup_signals(void)
{
	static struct {
		int signo;
		void (*handler)(int);
		char const * failure;
	} const wanted[] = {
		{ SIGUSR1, child_sigusr1,
		  "oprofiled: install of SIGUSR1 handler failed" },
		{ SIGUSR2, child_sigusr2,
		  "oprofiled: install of SIGUSR2 handler failed" },
		{ SIGTERM, child_sigterm,
		  "oprofiled: install of SIGTERM handler failed" },
	};
	struct sigaction act;
	sigset_t blocked;
	size_t k;

	/* keep SIGUSR1/SIGUSR2 pending until the caller chooses to wait for them */
	sigemptyset(&blocked);
	sigaddset(&blocked, SIGUSR1);
	sigaddset(&blocked, SIGUSR2);
	sigprocmask(SIG_BLOCK, &blocked, NULL);

	for (k = 0; k < sizeof(wanted) / sizeof(wanted[0]); ++k) {
		act.sa_handler = wanted[k].handler;
		act.sa_flags = 0;
		sigemptyset(&act.sa_mask);

		if (sigaction(wanted[k].signo, &act, NULL)) {
			perror(wanted[k].failure);
			exit(EXIT_FAILURE);
		}
	}
}


/**
 * Create a system-wide perfmon context and record its descriptor.
 *
 * The request carries the file's uuid in ctx_smpl_buf_id and the
 * PFM_FL_SYSTEM_WIDE flag. On success the descriptor reported in
 * ctx.ctx_fd is stored in self->ctx_fd; on failure a diagnostic is
 * printed and the process exits with EXIT_FAILURE.
 */
static void create_context(struct child * self)
{
	pfarg_context_t ctx;

	memset(&ctx, 0, sizeof(ctx));
	ctx.ctx_flags = PFM_FL_SYSTEM_WIDE;
	memcpy(&ctx.ctx_smpl_buf_id, &uuid, 16);

	if (perfmonctl(0, PFM_CREATE_CONTEXT, &ctx, 1) == -1) {
		perror("CREATE_CONTEXT failed");
		exit(EXIT_FAILURE);
	}

	self->ctx_fd = ctx.ctx_fd;
}


/** program the perfmon counters */
static void write_pmu(struct child * self)
{
	pfarg_reg_t pc[OP_MAX_COUNTERS];
	pfarg_reg_t pd[OP_MAX_COUNTERS];
	int err;
	size_t i;

	memset(pc, 0, sizeof(pc));
	memset(pd, 0, sizeof(pd));

#define PMC_GEN_INTERRUPT (1UL << 5)
#define PMC_PRIV_MONITOR (1UL << 6)
/* McKinley requires pmc4 to have bit 23 set (enable PMU).
 * It is supposedly ignored in other pmc registers.
 */
#define PMC_MANDATORY (1UL << 23)
#define PMC_USER (1UL << 3)
#define PMC_KERNEL (1UL << 0)
	for (i = 0; i < op_nr_counters && opd_events[i].name; ++i) {
		struct opd_event * event = &opd_events[i];
		pc[i].reg_num = event->counter + 4;
		pc[i].reg_value = PMC_GEN_INTERRUPT;
		pc[i].reg_value |= PMC_PRIV_MONITOR;
		pc[i].reg_value |= PMC_MANDATORY;
		(event->user) ? (pc[i].reg_value |= PMC_USER)
		              : (pc[i].reg_value &= ~PMC_USER);
		(event->kernel) ? (pc[i].reg_value |= PMC_KERNEL)
		                : (pc[i].reg_value &= ~PMC_KERNEL);
		pc[i].reg_value &= ~(0xff << 8);
		pc[i].reg_value |= ((event->value & 0xff) << 8);
		pc[i].reg_value &= ~(0xf << 16);
		pc[i].reg_value |= ((event->um & 0xf) << 16);
		pc[i].reg_smpl_eventid = event->counter;
	}

	for (i = 0; i < op_nr_counters && opd_events[i].name; ++i) {
		struct opd_event * event = &opd_events[i];
		pd[i].reg_value = ~0UL - event->count + 1;
		pd[i].reg_short_reset = ~0UL - event->count + 1;
		pd[i].reg_num = event->counter + 4;
	}

	err = perfmonctl(self->ctx_fd, PFM_WRITE_PMCS, pc, i);
	if (err == -1) {
		perror("Couldn't write PMCs");
		exit(EXIT_FAILURE);
	}

	err = perfmonctl(self->ctx_fd, PFM_WRITE_PMDS, pd, i);
	if (err == -1) {
		perror("Couldn't write PMDs");
		exit(EXIT_FAILURE);
	}
}


/**
 * Issue PFM_LOAD_CONTEXT on self->ctx_fd with load_pid set to self->pid.
 * Prints a diagnostic and exits with EXIT_FAILURE if perfmonctl()
 * returns -1.
 */
static void load_context(struct child * self)
{
	pfarg_load_t target;

	memset(&target, 0, sizeof(target));
	target.load_pid = self->pid;

	if (perfmonctl(self->ctx_fd, PFM_LOAD_CONTEXT, &target, 1) != -1)
		return;

	perror("Couldn't load context");
	exit(EXIT_FAILURE);
}


/**
 * Tell the parent that this child is up by writing its CPU number to
 * the write end of self->up_pipe.
 *
 * The write is retried until a full size_t has been written; a write
 * interrupted by a signal (EINTR) is retried, any other error prints a
 * diagnostic and exits with EXIT_FAILURE.
 */
static void notify_parent(struct child * self, size_t cpu)
{
	for (;;) {
		ssize_t ret;
		ret = write(self->up_pipe[1], &cpu, sizeof(size_t));
		if (ret == sizeof(size_t))
			break;
		if (ret < 0 && errno != EINTR) {
			/* perror() appends ": <reason>" itself, so no trailing colon here */
			perror("Failed to write child pipe");
			exit(EXIT_FAILURE);
		}
	}
}

/** child entry whose pipe close_pipe() closes; set by run_child() */
static struct child * inner_child;

/**
 * Close the write end of inner_child's up_pipe.
 * run_child() registers this with atexit().
 */
void close_pipe(void)
{
	int const write_end = inner_child->up_pipe[1];

	close(write_end);
}

/**
 * Body of the perfmon child for CPU number cpu; never returns.
 *
 * The child fills in its children[] entry, registers close_pipe() to
 * run at exit, moves to "/", installs its signal handlers, pins itself
 * to the CPU, creates/programs/loads its perfmon context and then tells
 * the parent it is up. Afterwards it redirects the standard streams to
 * /dev/null and loops forever: a pending SIGUSR1 flag starts the
 * counters, a pending SIGUSR2 flag stops them, and sigsuspend() waits
 * for the next SIGUSR1, SIGUSR2 or SIGTERM.
 */
static void run_child(size_t cpu)
{
	struct child * me = &children[cpu];
	sigset_t wait_mask;

	me->pid = getpid();
	me->sigusr1 = 0;
	me->sigusr2 = 0;
	me->sigterm = 0;

	/* make sure the pipe's write end is closed however the child exits */
	inner_child = me;
	if (atexit(close_pipe)){
		close_pipe();
		exit(EXIT_FAILURE);
	}

	umask(0);
	/* Change directory to allow directory to be removed */
	if (chdir("/") < 0) {
		perror("Unable to chdir to \"/\"");
		exit(EXIT_FAILURE);
	}

	setup_signals();
	set_affinity(cpu);
	create_context(me);
	write_pmu(me);
	load_context(me);
	notify_parent(me, cpu);

	/* Redirect standard files to /dev/null */
	freopen( "/dev/null", "r", stdin);
	freopen( "/dev/null", "w", stdout);
	freopen( "/dev/null", "w", stderr);

	/* while suspended, only SIGUSR1, SIGUSR2 and SIGTERM are let through */
	sigfillset(&wait_mask);
	sigdelset(&wait_mask, SIGUSR1);
	sigdelset(&wait_mask, SIGUSR2);
	sigdelset(&wait_mask, SIGTERM);

	while (1) {
		if (me->sigusr1) {
			perfmon_start_child(me->ctx_fd);
			me->sigusr1 = 0;
		}

		if (me->sigusr2) {
			perfmon_stop_child(me->ctx_fd);
			me->sigusr2 = 0;
		}

		sigsuspend(&wait_mask);
	}
}


/**
 * Block until the given child reports that it is up, then close the
 * read end of its pipe.
 *
 * A full size_t (the child's CPU number) is read from child->up_pipe[0].
 * Reads interrupted by a signal (EINTR) are retried. Any other read
 * error, or end-of-file, prints a diagnostic and exits with
 * EXIT_FAILURE.
 */
static void wait_for_child(struct child * child)
{
	size_t tmp;
	for (;;) {
		ssize_t ret;
		ret = read(child->up_pipe[0], &tmp, sizeof(size_t));
		if (ret == sizeof(size_t))
			break;
		if (ret == 0) {
			/*
			 * End-of-file: every write end is closed, i.e. the child
			 * went away without reporting. read() does not set errno
			 * in this case, so perror() would print a stale reason.
			 */
			fprintf(stderr, "Failed to read child pipe: "
				"pipe closed before child reported\n");
			exit(EXIT_FAILURE);
		}
		if (ret < 0 && errno != EINTR) {
			perror("Failed to read child pipe");
			exit(EXIT_FAILURE);
		}
	}
	printf("Perfmon child up on CPU%d\n", (int)tmp);
	fflush(stdout);

	close(child->up_pipe[0]);
}

/** context owned by the daemon itself; used only when no_xen is false */
static struct child* xen_ctx;

/**
 * Set up perfmon sampling.
 *
 * Does nothing when cpu_type is CPU_TIMER_INT. When no_xen is false, a
 * single context is created, programmed and loaded in the calling
 * process. Otherwise one child process is forked per online CPU; each
 * child runs run_child() and the parent waits for each child to report
 * that it is up before forking the next one. Any failure prints a
 * diagnostic and exits with EXIT_FAILURE.
 */
void perfmon_init(void)
{
	size_t cpu;
	long online;

	if (cpu_type == CPU_TIMER_INT)
		return;

	if (!no_xen) {
		xen_ctx = xmalloc(sizeof(struct child));
		xen_ctx->pid = getpid();
		xen_ctx->up_pipe[0] = -1;
		xen_ctx->up_pipe[1] = -1;
		xen_ctx->sigusr1 = 0;
		xen_ctx->sigusr2 = 0;
		xen_ctx->sigterm = 0;

		create_context(xen_ctx);
		write_pmu(xen_ctx);
		load_context(xen_ctx);
		return;
	}

	online = sysconf(_SC_NPROCESSORS_ONLN);
	if (online == -1) {
		fprintf(stderr, "Couldn't determine number of CPUs.\n");
		exit(EXIT_FAILURE);
	}

	nr_cpus = online;

	children = xmalloc(sizeof(struct child) * nr_cpus);
	memset(children, 0, sizeof(struct child) * nr_cpus);

	for (cpu = 0; cpu < nr_cpus; ++cpu) {
		struct child * slot = &children[cpu];
		int forked;

		if (pipe(slot->up_pipe)) {
			perror("Couldn't create child pipe");
			exit(EXIT_FAILURE);
		}

		forked = fork();
		if (forked == -1) {
			perror("Couldn't fork perfmon child");
			exit(EXIT_FAILURE);
		}

		if (forked == 0) {
			/* child: keeps only the write end */
			close(slot->up_pipe[0]);
			run_child(cpu);
		} else {
			/* parent: keeps only the read end until the child is up */
			slot->pid = forked;
			close(slot->up_pipe[1]);
			printf("Waiting on CPU%d\n", (int)cpu);
			wait_for_child(slot);
		}
	}
}


/**
 * Tear down the per-CPU perfmon children.
 *
 * Does nothing when cpu_type is CPU_TIMER_INT or when no_xen is false.
 * Otherwise every children[] entry with a non-zero pid has its pid
 * cleared and is sent SIGKILL; if the kill succeeds the child is
 * reaped with waitpid().
 */
void perfmon_exit(void)
{
	size_t cpu;

	if (cpu_type == CPU_TIMER_INT || !no_xen)
		return;

	for (cpu = 0; cpu < nr_cpus; ++cpu) {
		int victim = children[cpu].pid;

		if (!victim)
			continue;

		children[cpu].pid = 0;

		if (kill(victim, SIGKILL) != 0)
			continue;

		waitpid(victim, NULL, 0);
	}
}


/**
 * Start sampling.
 *
 * Does nothing when cpu_type is CPU_TIMER_INT. When no_xen is false the
 * daemon's own context is started directly; otherwise SIGUSR1 is sent
 * to every per-CPU child. A failing kill() prints a diagnostic and
 * exits with EXIT_FAILURE.
 */
void perfmon_start(void)
{
	size_t cpu;

	if (cpu_type == CPU_TIMER_INT)
		return;

	if (!no_xen) {
		perfmon_start_child(xen_ctx->ctx_fd);
		return;
	}

	for (cpu = 0; cpu < nr_cpus; ++cpu) {
		if (kill(children[cpu].pid, SIGUSR1) == 0)
			continue;

		perror("Unable to start perfmon");
		exit(EXIT_FAILURE);
	}
}


/**
 * Stop sampling.
 *
 * Does nothing when cpu_type is CPU_TIMER_INT. When no_xen is false the
 * daemon's own context is stopped directly; otherwise SIGUSR2 is sent
 * to every per-CPU child. A failing kill() prints a diagnostic and
 * exits with EXIT_FAILURE.
 */
void perfmon_stop(void)
{
	size_t cpu;

	if (cpu_type == CPU_TIMER_INT)
		return;

	if (!no_xen) {
		perfmon_stop_child(xen_ctx->ctx_fd);
		return;
	}

	for (cpu = 0; cpu < nr_cpus; ++cpu) {
		if (kill(children[cpu].pid, SIGUSR2) == 0)
			continue;

		perror("Unable to stop perfmon");
		exit(EXIT_FAILURE);
	}
}

#endif /* __ia64__ */