#include <math.h>
#include "json.h"
#include "idletime.h"

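/*
 * Shared state for idle profiling. The approach implemented below: pin
 * one thread to each online cpu, drop it to the lowest scheduling
 * priority (the IDLE class, where supported), and have it churn through
 * page-sized units of memory work. Such a thread only runs when its cpu
 * has nothing else to do, so the fraction of wall time it spends
 * working approximates how idle that cpu is.
 */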
static volatile struct idle_prof_common ipc;

/*
 * Measure the time needed to complete a unit of work on a particular
 * cpu. The minimum time over CALIBRATE_RUNS runs is returned, in
 * microseconds per unit.
 */
static double calibrate_unit(unsigned char *data)
{
	unsigned long t, i, j, k;
	struct timeval tps;
	double tunit = 0.0;

	for (i = 0; i < CALIBRATE_RUNS; i++) {

		fio_gettime(&tps, NULL);
		/* scale for less variance */
		for (j = 0; j < CALIBRATE_SCALE; j++) {
			/* unit of work */
			for (k = 0; k < page_size; k++) {
				data[(k + j) % page_size] = k % 256;
				/*
				 * we won't see STOP here during calibration;
				 * the check only exists to match the profiling
				 * loop, so both loops execute the same
				 * instructions per unit of work.
				 */
				if (ipc.status == IDLE_PROF_STATUS_PROF_STOP)
					return 0.0;
			}
		}

		t = utime_since_now(&tps);
		if (!t)
			continue;

		/*
		 * get the minimum time to complete CALIBRATE_SCALE units;
		 * check against 0.0 rather than i == 0, since the first
		 * run may have been skipped above for a zero elapsed time.
		 */
		if ((tunit == 0.0) || ((double)t < tunit))
			tunit = (double)t;
	}

	return tunit / CALIBRATE_SCALE;
}

static void free_cpu_affinity(struct idle_prof_thread *ipt)
{
#if defined(FIO_HAVE_CPU_AFFINITY)
	fio_cpuset_exit(&ipt->cpu_mask);
#endif
}

static int set_cpu_affinity(struct idle_prof_thread *ipt)
{
#if defined(FIO_HAVE_CPU_AFFINITY)
	if (fio_cpuset_init(&ipt->cpu_mask)) {
		log_err("fio: cpuset init failed\n");
		return -1;
	}

	fio_cpu_set(&ipt->cpu_mask, ipt->cpu);

	if (fio_setaffinity(gettid(), ipt->cpu_mask)) {
		log_err("fio: fio_setaffinity failed\n");
		fio_cpuset_exit(&ipt->cpu_mask);
		return -1;
	}

	return 0;
#else
	log_err("fio: fio_setaffinity not supported\n");
	return -1;
#endif
}

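/*
 * Per-cpu worker thread. It blocks on init_lock until all workers are
 * spawned, pins itself to its assigned cpu, calibrates the unit of
 * work, switches to the IDLE scheduling class, signals the main thread
 * that calibration is done, then blocks on start_lock until profiling
 * starts. It then loops over its page of data until the main thread
 * sets IDLE_PROF_STATUS_PROF_STOP.
 */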
static void *idle_prof_thread_fn(void *data)
{
	int retval;
	unsigned long j, k;
	struct idle_prof_thread *ipt = data;

	/* wait until all threads have been spawned */
	pthread_mutex_lock(&ipt->init_lock);

	/* exit if any other thread failed to start */
	if (ipc.status == IDLE_PROF_STATUS_ABORT) {
		pthread_mutex_unlock(&ipt->init_lock);
		return NULL;
	}

	retval = set_cpu_affinity(ipt);
	if (retval == -1) {
		ipt->state = TD_EXITED;
		pthread_mutex_unlock(&ipt->init_lock);
		return NULL;
	}

	ipt->cali_time = calibrate_unit(ipt->data);

	/* delay setting the IDLE scheduling class until now, for better calibration accuracy */
#if defined(CONFIG_SCHED_IDLE)
	if ((retval = fio_set_sched_idle()))
		log_err("fio: fio_set_sched_idle failed\n");
#else
	retval = -1;
	log_err("fio: fio_set_sched_idle not supported\n");
#endif
	if (retval == -1) {
		ipt->state = TD_EXITED;
		pthread_mutex_unlock(&ipt->init_lock);
		goto do_exit;
	}

	ipt->state = TD_INITIALIZED;

	/* signal the main thread that calibration is done */
	pthread_cond_signal(&ipt->cond);
	pthread_mutex_unlock(&ipt->init_lock);

	/* wait for the other threads to finish calibration */
	pthread_mutex_lock(&ipt->start_lock);

	/* exit if any other thread failed to initialize */
	if (ipc.status == IDLE_PROF_STATUS_ABORT) {
		pthread_mutex_unlock(&ipt->start_lock);
		goto do_exit;
	}

	/* exit if we are doing calibration only */
	if (ipc.status == IDLE_PROF_STATUS_CALI_STOP) {
		pthread_mutex_unlock(&ipt->start_lock);
		goto do_exit;
	}

	fio_gettime(&ipt->tps, NULL);
	ipt->state = TD_RUNNING;

	j = 0;
	while (1) {
		for (k = 0; k < page_size; k++) {
			ipt->data[(k + j) % page_size] = k % 256;
			if (ipc.status == IDLE_PROF_STATUS_PROF_STOP) {
				fio_gettime(&ipt->tpe, NULL);
				goto idle_prof_done;
			}
		}
		j++;
	}

idle_prof_done:

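	/* account for the partial pass through the page when STOP was seen */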
	ipt->loops = j + (double) k / page_size;
	ipt->state = TD_EXITED;
	pthread_mutex_unlock(&ipt->start_lock);

do_exit:
	free_cpu_affinity(ipt);
	return NULL;
}

/* calculate the mean and sample standard deviation of the per-cpu unit work times */
static void calibration_stats(void)
{
	int i;
	double sum = 0.0, var = 0.0;
	struct idle_prof_thread *ipt;

	for (i = 0; i < ipc.nr_cpus; i++) {
		ipt = &ipc.ipts[i];
		sum += ipt->cali_time;
	}

	ipc.cali_mean = sum / ipc.nr_cpus;

	for (i = 0; i < ipc.nr_cpus; i++) {
		ipt = &ipc.ipts[i];
		var += pow(ipt->cali_time - ipc.cali_mean, 2);
	}

	/* sample stddev needs at least two cpus; avoid division by zero */
	if (ipc.nr_cpus > 1)
		ipc.cali_stddev = sqrt(var / (ipc.nr_cpus - 1));
	else
		ipc.cali_stddev = 0.0;
}

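/*
 * Spawn one profiling thread per online cpu and wait for all of them
 * to finish calibration. Profiling itself starts only when
 * fio_idle_prof_start() later releases each thread's start_lock.
 */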
void fio_idle_prof_init(void)
{
	int i, ret;
	struct timeval tp;
	struct timespec ts;
	pthread_attr_t tattr;
	struct idle_prof_thread *ipt;

	ipc.nr_cpus = cpus_online();
	ipc.status = IDLE_PROF_STATUS_OK;

	if (ipc.opt == IDLE_PROF_OPT_NONE)
		return;

	if ((ret = pthread_attr_init(&tattr))) {
		log_err("fio: pthread_attr_init %s\n", strerror(ret));
		return;
	}
	if ((ret = pthread_attr_setscope(&tattr, PTHREAD_SCOPE_SYSTEM))) {
		log_err("fio: pthread_attr_setscope %s\n", strerror(ret));
		return;
	}

	ipc.ipts = malloc(ipc.nr_cpus * sizeof(struct idle_prof_thread));
	if (!ipc.ipts) {
		log_err("fio: malloc failed\n");
		return;
	}

	ipc.buf = malloc(ipc.nr_cpus * page_size);
	if (!ipc.buf) {
		log_err("fio: malloc failed\n");
		free(ipc.ipts);
		return;
	}

	/*
	 * profiling aborts on any single thread failure since the
	 * result won't be accurate if any cpu is not used.
	 */
	for (i = 0; i < ipc.nr_cpus; i++) {
		ipt = &ipc.ipts[i];

		ipt->cpu = i;
		ipt->state = TD_NOT_CREATED;
		ipt->data = (unsigned char *)(ipc.buf + page_size * i);

		if ((ret = pthread_mutex_init(&ipt->init_lock, NULL))) {
			ipc.status = IDLE_PROF_STATUS_ABORT;
			log_err("fio: pthread_mutex_init %s\n", strerror(ret));
			break;
		}

		if ((ret = pthread_mutex_init(&ipt->start_lock, NULL))) {
			ipc.status = IDLE_PROF_STATUS_ABORT;
			log_err("fio: pthread_mutex_init %s\n", strerror(ret));
			break;
		}

		if ((ret = pthread_cond_init(&ipt->cond, NULL))) {
			ipc.status = IDLE_PROF_STATUS_ABORT;
			log_err("fio: pthread_cond_init %s\n", strerror(ret));
			break;
		}

		/* make sure all threads are spawned before they start */
		pthread_mutex_lock(&ipt->init_lock);

		/* make sure all threads finish init before profiling starts */
		pthread_mutex_lock(&ipt->start_lock);

		if ((ret = pthread_create(&ipt->thread, &tattr, idle_prof_thread_fn, ipt))) {
			ipc.status = IDLE_PROF_STATUS_ABORT;
			log_err("fio: pthread_create %s\n", strerror(ret));
			break;
		} else
			ipt->state = TD_CREATED;

		if ((ret = pthread_detach(ipt->thread))) {
			/* log error and let the thread spin */
			log_err("fio: pthread_detatch %s\n", strerror(ret));
		}
	}

	/*
	 * let good threads continue so that they can exit
	 * if errors on other threads occurred previously.
	 */
	for (i = 0; i < ipc.nr_cpus; i++) {
		ipt = &ipc.ipts[i];
		pthread_mutex_unlock(&ipt->init_lock);
	}
	
	if (ipc.status == IDLE_PROF_STATUS_ABORT)
		return;
	
	/* wait for calibration to finish */
	for (i = 0; i < ipc.nr_cpus; i++) {
		ipt = &ipc.ipts[i];
		pthread_mutex_lock(&ipt->init_lock);
		while ((ipt->state != TD_EXITED) &&
		       (ipt->state != TD_INITIALIZED)) {
			fio_gettime(&tp, NULL);
			ts.tv_sec = tp.tv_sec + 1;
			ts.tv_nsec = tp.tv_usec * 1000;
			pthread_cond_timedwait(&ipt->cond, &ipt->init_lock, &ts);
		}
		pthread_mutex_unlock(&ipt->init_lock);
	
		/*
		 * any thread that failed to initialize will cause the other
		 * threads to abort later, after fio_idle_prof_start.
		 */
		if (ipt->state == TD_EXITED)
			ipc.status = IDLE_PROF_STATUS_ABORT;
	}

	if (ipc.status != IDLE_PROF_STATUS_ABORT)
		calibration_stats();
	else
		ipc.cali_mean = ipc.cali_stddev = 0.0;

	if (ipc.opt == IDLE_PROF_OPT_CALI)
		ipc.status = IDLE_PROF_STATUS_CALI_STOP;
}

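/*
 * Release all workers into the profiling loop by unlocking their
 * start_locks. Each worker records its own start time, so skew in when
 * the workers actually wake up does not distort the per-cpu results.
 */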
void fio_idle_prof_start(void)
{
	int i;
	struct idle_prof_thread *ipt;

	if (ipc.opt == IDLE_PROF_OPT_NONE)
		return;

	/* unlock regardless of whether abort is set */
	for (i = 0; i < ipc.nr_cpus; i++) {
		ipt = &ipc.ipts[i];
		pthread_mutex_unlock(&ipt->start_lock);
	}
}

void fio_idle_prof_stop(void)
{
	int i;
	uint64_t runt;
	struct timeval tp;
	struct timespec ts;
	struct idle_prof_thread *ipt;

	if (ipc.opt == IDLE_PROF_OPT_NONE)
		return;

	if (ipc.opt == IDLE_PROF_OPT_CALI)
		return;

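	/* signal all workers to stop; each one records its own end time */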
	ipc.status = IDLE_PROF_STATUS_PROF_STOP;

	/* wait for all threads to exit from profiling */
	for (i = 0; i < ipc.nr_cpus; i++) {
		ipt = &ipc.ipts[i];
		pthread_mutex_lock(&ipt->start_lock);
		while ((ipt->state != TD_EXITED) &&
		       (ipt->state != TD_NOT_CREATED)) {
			fio_gettime(&tp, NULL);
			ts.tv_sec = tp.tv_sec + 1;
			ts.tv_nsec = tp.tv_usec * 1000;
			/* timed wait in case a signal is not received */
			pthread_cond_timedwait(&ipt->cond, &ipt->start_lock, &ts);
		}
		pthread_mutex_unlock(&ipt->start_lock);

		/*
		 * calculate idleness: the worker only runs when the cpu is
		 * otherwise idle, so the time it spent on unit work,
		 * loops * cali_mean, approximates the cpu's idle time over
		 * the profiling window. e.g. 1000 loops at cali_mean = 2us
		 * over a 4000us window gives 1000 * 2 / 4000 = 0.5, i.e.
		 * 50% idle.
		 */
		if (ipc.cali_mean != 0.0) {
			runt = utime_since(&ipt->tps, &ipt->tpe);
			if (runt)
				ipt->idleness = ipt->loops * ipc.cali_mean / runt;
			else
				ipt->idleness = 0.0;
		} else
			ipt->idleness = 0.0;
	}

	/*
	 * memory allocations are freed by an explicit fio_idle_prof_cleanup
	 * call, after the profiling stats have been collected by the caller.
	 */
}

/*
 * return the system-wide idle percentage when cpu is -1;
 * return the idle percentage of a single cpu otherwise.
 */
static double fio_idle_prof_cpu_stat(int cpu)
{
	int i, nr_cpus = ipc.nr_cpus;
	struct idle_prof_thread *ipt;
	double p = 0.0;

	if (ipc.opt == IDLE_PROF_OPT_NONE)
		return 0.0;

	if ((cpu >= nr_cpus) || (cpu < -1)) {
		log_err("fio: idle profiling invalid cpu index\n");
		return 0.0;
	}

	if (cpu == -1) {
		for (i = 0; i < nr_cpus; i++) {
			ipt = &ipc.ipts[i];
			p += ipt->idleness;
		}
		p /= nr_cpus;
	} else {
		ipt = &ipc.ipts[cpu];
		p = ipt->idleness;
	}

	return p * 100.0;
}

static void fio_idle_prof_cleanup(void)
{
	if (ipc.ipts) {
		free(ipc.ipts);
		ipc.ipts = NULL;
	}

	if (ipc.buf) {
		free(ipc.buf);
		ipc.buf = NULL;
	}
}

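/*
 * Parse the idle profiling command line option. "calibrate" runs the
 * calibration only and prints the result; "system" reports aggregate
 * idleness; "percpu" reports idleness for every cpu. Returns 1 after a
 * calibrate-only run, 0 on success, and -1 on error.
 */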
int fio_idle_prof_parse_opt(const char *args)
{
	ipc.opt = IDLE_PROF_OPT_NONE; /* default */

	if (!args) {
		log_err("fio: empty idle-prof option string\n");
		return -1;
	}

#if defined(FIO_HAVE_CPU_AFFINITY) && defined(CONFIG_SCHED_IDLE)
	if (strcmp("calibrate", args) == 0) {
		ipc.opt = IDLE_PROF_OPT_CALI;
		fio_idle_prof_init();
		fio_idle_prof_start();
		fio_idle_prof_stop();
		show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL);
		return 1;
	} else if (strcmp("system", args) == 0) {
		ipc.opt = IDLE_PROF_OPT_SYSTEM;
		return 0;
	} else if (strcmp("percpu", args) == 0) {
		ipc.opt = IDLE_PROF_OPT_PERCPU;
		return 0;
	} else {
		log_err("fio: incorrect idle-prof option: %s\n", args);
		return -1;
	}
#else
	log_err("fio: idle-prof not supported on this platform\n");
	return -1;
#endif
}

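/*
 * Report idleness results in normal or JSON format, then free the
 * profiling state once the stats have been emitted.
 */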
void show_idle_prof_stats(int output, struct json_object *parent)
{
	int i, nr_cpus = ipc.nr_cpus;
	struct json_object *tmp;
	char s[MAX_CPU_STR_LEN];

	if (output == FIO_OUTPUT_NORMAL) {
		if (ipc.opt > IDLE_PROF_OPT_CALI)
			log_info("\nCPU idleness:\n");
		else if (ipc.opt == IDLE_PROF_OPT_CALI)
			log_info("CPU idleness:\n");

		if (ipc.opt >= IDLE_PROF_OPT_SYSTEM)
			log_info("  system: %3.2f%%\n", fio_idle_prof_cpu_stat(-1));

		if (ipc.opt == IDLE_PROF_OPT_PERCPU) {
			log_info("  percpu: %3.2f%%", fio_idle_prof_cpu_stat(0));
			for (i = 1; i < nr_cpus; i++)
				log_info(", %3.2f%%", fio_idle_prof_cpu_stat(i));
			log_info("\n");
		}

		if (ipc.opt >= IDLE_PROF_OPT_CALI) {
			log_info("  unit work: mean=%3.2fus,", ipc.cali_mean);
			log_info(" stddev=%3.2f\n", ipc.cali_stddev);
		}

		/* dynamic mem allocations can now be freed */
		if (ipc.opt != IDLE_PROF_OPT_NONE)
			fio_idle_prof_cleanup();

		return;
	}

	if ((ipc.opt != IDLE_PROF_OPT_NONE) && (output == FIO_OUTPUT_JSON)) {
		if (!parent)
			return;

		tmp = json_create_object();
		if (!tmp)
			return;

		json_object_add_value_object(parent, "cpu_idleness", tmp);
		json_object_add_value_float(tmp, "system", fio_idle_prof_cpu_stat(-1));

		if (ipc.opt == IDLE_PROF_OPT_PERCPU) {
			for (i = 0; i < nr_cpus; i++) {
				snprintf(s, MAX_CPU_STR_LEN, "cpu-%d", i);
				json_object_add_value_float(tmp, s, fio_idle_prof_cpu_stat(i));
			}
		}

		json_object_add_value_float(tmp, "unit_mean", ipc.cali_mean);
		json_object_add_value_float(tmp, "unit_stddev", ipc.cali_stddev);
		
		fio_idle_prof_cleanup();
	}
}