/*
 *    Hypervisor filesystem for Linux on s390. Diag 204 and 224
 *    implementation.
 *
 *    Copyright IBM Corp. 2006, 2008
 *    Author(s): Michael Holzheu <holzheu@de.ibm.com>
 */

/*
 * Message prefix: pr_fmt(fmt) expands to the format string prefixed with
 * "hypfs: ". Presumably picked up by the pr_err() calls in this file via the
 * usual printk convention - confirm against the printk headers.
 */
#define KMSG_COMPONENT "hypfs"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <asm/ebcdic.h>
#include "hypfs.h"

/*
 * Buffers that receive a name must provide one byte more than the *_NAME_LEN
 * value, since the helpers in this file write a terminating 0 at that index.
 */
#define LPAR_NAME_LEN 8		/* lpar name len in diag 204 data */
#define CPU_NAME_LEN 16		/* type name len of cpus in diag224 name table */
#define TMP_SIZE 64		/* size of temporary buffers */

/* Value stored in dbfs_d204_hdr.version by dbfs_d204_create() */
#define DBFS_D204_HDR_VERSION	0

/*
 * diag 204 subcodes. The subcode is ORed with a diag204_format value to form
 * the first argument of diag204().
 */
enum diag204_sc {
	SUBC_STIB4 = 4,		/* store, used with INFO_SIMPLE in diag204_probe() */
	SUBC_RSI = 5,		/* used by diag204_get_buffer() to query the page count */
	SUBC_STIB6 = 6,		/* store, used with INFO_EXT */
	SUBC_STIB7 = 7		/* store, used with INFO_EXT; probed first */
};

/*
 * The two available diag 204 data formats. The value is ORed into the subcode
 * passed to diag204() and also selects which struct layout the getter
 * functions below apply to a buffer.
 */
enum diag204_format {
	INFO_SIMPLE = 0,		/* short structs (info_blk_hdr, part_hdr, ...) */
	INFO_EXT = 0x00010000		/* extended structs (x_info_blk_hdr, ...) */
};

/*
 * Bit is set in the flags member of the time information block header when
 * physical cpu info is included in diag 204 data. Tested in
 * hypfs_diag_create_files() before the physical cpu files are created.
 */
#define LPAR_PHYS_FLG  0x80

/* diag 224 name table; allocated in diag224_get_name_table() */
static char *diag224_cpu_names;			/* diag 224 name table */
/* Both values below are chosen once by diag204_probe() */
static enum diag204_sc diag204_store_sc;	/* used subcode for store */
static enum diag204_format diag204_info_type;	/* used diag 204 data format */

/*
 * Cached diag 204 buffer. diag204_buf is what gets passed to diag204();
 * diag204_buf_vmalloc is only set when the buffer came from vmalloc() and is
 * the pointer that has to be handed to vfree().
 */
static void *diag204_buf;		/* 4K aligned buffer for diag204 data */
static void *diag204_buf_vmalloc;	/* vmalloc pointer for diag204 data */
static int diag204_buf_pages;		/* number of pages for diag204 data */

/*
 * NOTE(review): no assignment to this dentry is visible in this file; it is
 * only passed to debugfs_remove().
 */
static struct dentry *dbfs_d204_file;

/*
 * DIAG 204 data structures and member access functions.
 *
 * Since we have two different diag 204 data formats for old and new s390
 * machines, we do not access the structs directly, but use getter functions for
 * each struct member instead. This should make the code more readable.
 */

/* Time information block */

/*
 * Time information block header, simple format (INFO_SIMPLE).
 * Overlaid on the start of the buffer filled by diag204(); packed, so the
 * struct occupies exactly 16 bytes.
 */
struct info_blk_hdr {
	__u8  npar;		/* loop bound for the partition blocks */
	__u8  flags;		/* tested against LPAR_PHYS_FLG */
	__u16 tslice;
	__u16 phys_cpus;	/* read via info_blk_hdr__pcpus() */
	__u16 this_part;
	__u64 curtod;
} __attribute__ ((packed));

/*
 * Time information block header, extended format (INFO_EXT).
 * The first four members sit at the same offsets as in struct info_blk_hdr;
 * packed, so the struct occupies exactly 64 bytes.
 */
struct x_info_blk_hdr {
	__u8  npar;		/* loop bound for the partition blocks */
	__u8  flags;		/* tested against LPAR_PHYS_FLG */
	__u16 tslice;
	__u16 phys_cpus;	/* read via info_blk_hdr__pcpus() */
	__u16 this_part;
	__u64 curtod1;
	__u64 curtod2;
	char reserved[40];
} __attribute__ ((packed));

/*
 * Size in bytes of the time information block header for the given format.
 * Every value other than INFO_SIMPLE is treated as INFO_EXT.
 */
static inline int info_blk_hdr__size(enum diag204_format type)
{
	switch (type) {
	case INFO_SIMPLE:
		return sizeof(struct info_blk_hdr);
	default: /* INFO_EXT */
		return sizeof(struct x_info_blk_hdr);
	}
}

/*
 * Read the npar member of a time information block header.
 * @type selects the layout that @hdr is interpreted with.
 */
static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr)
{
	struct info_blk_hdr *simple = hdr;
	struct x_info_blk_hdr *ext = hdr;

	return type == INFO_SIMPLE ? simple->npar : ext->npar;
}

/*
 * Read the flags member of a time information block header.
 * @type selects the layout that @hdr is interpreted with.
 */
static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr)
{
	struct info_blk_hdr *simple = hdr;
	struct x_info_blk_hdr *ext = hdr;

	return type == INFO_SIMPLE ? simple->flags : ext->flags;
}

/*
 * Read the phys_cpus member of a time information block header.
 * @type selects the layout that @hdr is interpreted with.
 */
static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr)
{
	struct info_blk_hdr *simple = hdr;
	struct x_info_blk_hdr *ext = hdr;

	return type == INFO_SIMPLE ? simple->phys_cpus : ext->phys_cpus;
}

/* Partition header */

/*
 * Partition header, simple format (INFO_SIMPLE). Packed, 16 bytes.
 * In the diag 204 buffer it is followed by the cpu info blocks of the
 * partition (see hypfs_create_lpar_files()).
 */
struct part_hdr {
	__u8 pn;
	__u8 cpus;			/* number of cpu info blocks that follow */
	char reserved[6];
	char part_name[LPAR_NAME_LEN];	/* EBCDIC, not 0 terminated */
} __attribute__ ((packed));

/*
 * Partition header, extended format (INFO_EXT). Packed, 96 bytes.
 * In the diag 204 buffer it is followed by the cpu info blocks of the
 * partition (see hypfs_create_lpar_files()).
 */
struct x_part_hdr {
	__u8  pn;
	__u8  cpus;
	__u8  rcpus;			/* number of cpu info blocks that follow */
	__u8  pflag;
	__u32 mlu;
	char  part_name[LPAR_NAME_LEN];	/* EBCDIC, not 0 terminated */
	char  lpc_name[8];
	char  os_name[8];
	__u64 online_cs;
	__u64 online_es;
	__u8  upid;
	char  reserved1[3];
	__u32 group_mlu;
	char  group_name[8];
	char  reserved2[32];
} __attribute__ ((packed));

/*
 * Size in bytes of a partition header for the given format.
 * Every value other than INFO_SIMPLE is treated as INFO_EXT.
 */
static inline int part_hdr__size(enum diag204_format type)
{
	switch (type) {
	case INFO_SIMPLE:
		return sizeof(struct part_hdr);
	default: /* INFO_EXT */
		return sizeof(struct x_part_hdr);
	}
}

/*
 * Number of cpu info blocks that follow a partition header.
 * The simple format only has the cpus member, so that one is returned there;
 * the extended format supplies the rcpus member.
 */
static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr)
{
	struct part_hdr *simple = hdr;
	struct x_part_hdr *ext = hdr;

	return type == INFO_SIMPLE ? simple->cpus : ext->rcpus;
}

/*
 * Copy the partition name out of a partition header as an ASCII string.
 * @name must provide at least LPAR_NAME_LEN + 1 bytes: the raw name is
 * copied, converted with EBCASC(), 0 terminated and then passed to strim().
 */
static inline void part_hdr__part_name(enum diag204_format type, void *hdr,
				       char *name)
{
	const char *raw_name;

	if (type == INFO_SIMPLE)
		raw_name = ((struct part_hdr *)hdr)->part_name;
	else /* INFO_EXT */
		raw_name = ((struct x_part_hdr *)hdr)->part_name;

	memcpy(name, raw_name, LPAR_NAME_LEN);
	EBCASC(name, LPAR_NAME_LEN);
	name[LPAR_NAME_LEN] = 0;
	strim(name);
}

/*
 * Per-cpu block of a partition, simple format (INFO_SIMPLE).
 * Packed, 24 bytes.
 */
struct cpu_info {
	__u16 cpu_addr;		/* used as directory name of the cpu */
	char  reserved1[2];
	__u8  ctidx;		/* index passed to diag224_idx2name() */
	__u8  cflag;
	__u16 weight;
	__u64 acc_time;		/* "mgmtime" file shows acc_time - lp_time */
	__u64 lp_time;		/* shown in the "cputime" file */
} __attribute__ ((packed));

/*
 * Per-cpu block of a partition, extended format (INFO_EXT).
 * Starts with the same members as struct cpu_info; packed, 96 bytes.
 */
struct x_cpu_info {
	__u16 cpu_addr;		/* used as directory name of the cpu */
	char  reserved1[2];
	__u8  ctidx;		/* index passed to diag224_idx2name() */
	__u8  cflag;
	__u16 weight;
	__u64 acc_time;		/* "mgmtime" file shows acc_time - lp_time */
	__u64 lp_time;		/* shown in the "cputime" file */
	__u16 min_weight;
	__u16 cur_weight;
	__u16 max_weight;
	/* NOTE(review): member name is misspelled; nothing in this file uses it */
	char  reseved2[2];
	__u64 online_time;	/* shown in the "onlinetime" file */
	__u64 wait_time;
	__u32 pma_weight;
	__u32 polar_weight;
	char  reserved3[40];
} __attribute__ ((packed));

/* CPU info block */

/*
 * Size in bytes of one partition cpu info block for the given format.
 * Every value other than INFO_SIMPLE is treated as INFO_EXT.
 */
static inline int cpu_info__size(enum diag204_format type)
{
	switch (type) {
	case INFO_SIMPLE:
		return sizeof(struct cpu_info);
	default: /* INFO_EXT */
		return sizeof(struct x_cpu_info);
	}
}

/*
 * Read the ctidx member of a partition cpu info block.
 * @type selects the layout that @hdr is interpreted with.
 */
static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr)
{
	struct cpu_info *simple = hdr;
	struct x_cpu_info *ext = hdr;

	return type == INFO_SIMPLE ? simple->ctidx : ext->ctidx;
}

/*
 * Read the cpu_addr member of a partition cpu info block.
 * @type selects the layout that @hdr is interpreted with.
 */
static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr)
{
	struct cpu_info *simple = hdr;
	struct x_cpu_info *ext = hdr;

	return type == INFO_SIMPLE ? simple->cpu_addr : ext->cpu_addr;
}

/*
 * Read the acc_time member of a partition cpu info block.
 * @type selects the layout that @hdr is interpreted with.
 */
static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr)
{
	struct cpu_info *simple = hdr;
	struct x_cpu_info *ext = hdr;

	return type == INFO_SIMPLE ? simple->acc_time : ext->acc_time;
}

/*
 * Read the lp_time member of a partition cpu info block.
 * @type selects the layout that @hdr is interpreted with.
 */
static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr)
{
	struct cpu_info *simple = hdr;
	struct x_cpu_info *ext = hdr;

	return type == INFO_SIMPLE ? simple->lp_time : ext->lp_time;
}

/*
 * Read the online_time member of a partition cpu info block.
 * The simple format has no such member, so 0 is returned for INFO_SIMPLE.
 */
static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr)
{
	struct x_cpu_info *ext = hdr;

	/* online_time not available in simple info */
	return type == INFO_SIMPLE ? 0 : ext->online_time;
}

/* Physical header */

/*
 * Physical header, simple format (INFO_SIMPLE). Packed, 16 bytes.
 * hypfs_diag_create_files() expects it directly after the last partition
 * block; it is followed by the physical cpu blocks.
 */
struct phys_hdr {
	char reserved1[1];
	__u8 cpus;		/* number of physical cpu blocks that follow */
	char reserved2[6];
	char mgm_name[8];
} __attribute__ ((packed));

/*
 * Physical header, extended format (INFO_EXT). Same leading members as
 * struct phys_hdr plus a reserved tail; packed, 96 bytes.
 */
struct x_phys_hdr {
	char reserved1[1];
	__u8 cpus;		/* number of physical cpu blocks that follow */
	char reserved2[6];
	char mgm_name[8];
	char reserved3[80];
} __attribute__ ((packed));

/*
 * Size in bytes of the physical header for the given format.
 * Every value other than INFO_SIMPLE is treated as INFO_EXT.
 */
static inline int phys_hdr__size(enum diag204_format type)
{
	switch (type) {
	case INFO_SIMPLE:
		return sizeof(struct phys_hdr);
	default: /* INFO_EXT */
		return sizeof(struct x_phys_hdr);
	}
}

/*
 * Read the cpus member of a physical header.
 * @type selects the layout that @hdr is interpreted with.
 */
static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr)
{
	struct phys_hdr *simple = hdr;
	struct x_phys_hdr *ext = hdr;

	return type == INFO_SIMPLE ? simple->cpus : ext->cpus;
}

/* Physical CPU info block */

/*
 * Physical cpu info block, simple format (INFO_SIMPLE). Packed, 24 bytes.
 */
struct phys_cpu {
	__u16 cpu_addr;		/* used as directory name of the cpu */
	char  reserved1[2];
	__u8  ctidx;		/* index passed to diag224_idx2name() */
	char  reserved2[3];
	__u64 mgm_time;		/* shown in the "mgmtime" file */
	char  reserved3[8];
} __attribute__ ((packed));

/*
 * Physical cpu info block, extended format (INFO_EXT). Same leading members
 * as struct phys_cpu with a larger reserved tail; packed, 96 bytes.
 */
struct x_phys_cpu {
	__u16 cpu_addr;		/* used as directory name of the cpu */
	char  reserved1[2];
	__u8  ctidx;		/* index passed to diag224_idx2name() */
	char  reserved2[3];
	__u64 mgm_time;		/* shown in the "mgmtime" file */
	char  reserved3[80];
} __attribute__ ((packed));

/*
 * Size in bytes of one physical cpu info block for the given format.
 * Every value other than INFO_SIMPLE is treated as INFO_EXT.
 */
static inline int phys_cpu__size(enum diag204_format type)
{
	switch (type) {
	case INFO_SIMPLE:
		return sizeof(struct phys_cpu);
	default: /* INFO_EXT */
		return sizeof(struct x_phys_cpu);
	}
}

/*
 * Read the cpu_addr member of a physical cpu info block.
 * @type selects the layout that @hdr is interpreted with.
 */
static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr)
{
	struct phys_cpu *simple = hdr;
	struct x_phys_cpu *ext = hdr;

	return type == INFO_SIMPLE ? simple->cpu_addr : ext->cpu_addr;
}

/*
 * Read the mgm_time member of a physical cpu info block.
 * @type selects the layout that @hdr is interpreted with.
 */
static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr)
{
	struct phys_cpu *simple = hdr;
	struct x_phys_cpu *ext = hdr;

	return type == INFO_SIMPLE ? simple->mgm_time : ext->mgm_time;
}

static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
{
	if (type == INFO_SIMPLE)
		return ((struct phys_cpu *)hdr)->ctidx;
	else /* INFO_EXT */
		return ((struct x_phys_cpu *)hdr)->ctidx;
}

/* Diagnose 204 functions */

/*
 * Issue diagnose 0x204.
 * @subcode: subcode ORed with the data format, passed in register 0
 * @size:    size argument, passed in register 1 (callers pass a page count)
 * @addr:    buffer address operand
 *
 * Returns -1 if register 0 is non-zero after the instruction, otherwise the
 * value left in register 1.
 */
static int diag204(unsigned long subcode, unsigned long size, void *addr)
{
	/* The instruction takes its arguments in fixed registers 0 and 1 */
	register unsigned long _subcode asm("0") = subcode;
	register unsigned long _size asm("1") = size;

	/*
	 * NOTE(review): the exception table entry presumably lets a program
	 * check on the diag instruction continue at label 0 with _subcode
	 * still holding the (non-zero) input, which is then reported as
	 * failure below - confirm against the EX_TABLE definition.
	 */
	asm volatile(
		"	diag	%2,%0,0x204\n"
		"0:\n"
		EX_TABLE(0b,0b)
		: "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
	if (_subcode)
		return -1;
	return _size;
}

/*
 * For the old diag subcode 4 with simple data format we have to use real
 * memory. If we use subcode 6 or 7 with extended data format, we can (and
 * should) use vmalloc, since we need a lot of memory in that case. Currently
 * up to 93 pages!
 */

/*
 * Release the cached diag 204 buffer, if there is one. A vmalloc'ed buffer is
 * freed through its original (unaligned) pointer, a page buffer through
 * free_pages().
 */
static void diag204_free_buffer(void)
{
	if (diag204_buf == NULL)
		return;

	if (!diag204_buf_vmalloc) {
		free_pages((unsigned long) diag204_buf, 0);
	} else {
		vfree(diag204_buf_vmalloc);
		diag204_buf_vmalloc = NULL;
	}
	diag204_buf = NULL;
}

/* Apply PAGE_ALIGN() to the address held in @ptr and return it as a pointer */
static void *page_align_ptr(void *ptr)
{
	unsigned long addr = (unsigned long) ptr;

	return (void *) PAGE_ALIGN(addr);
}

/*
 * Allocate the cached diag 204 buffer with vmalloc().
 * One page more than requested is allocated so that a page aligned start
 * address with room for @pages pages can be picked inside the allocation.
 * Returns the aligned buffer or ERR_PTR(-ENOMEM).
 */
static void *diag204_alloc_vbuf(int pages)
{
	void *raw = vmalloc(PAGE_SIZE * (pages + 1));

	diag204_buf_vmalloc = raw;
	if (!raw)
		return ERR_PTR(-ENOMEM);

	/* The buffer has to be page aligned! */
	diag204_buf = page_align_ptr(raw);
	diag204_buf_pages = pages;
	return diag204_buf;
}

/*
 * Allocate the cached diag 204 buffer as a single page via
 * __get_free_pages(). Returns the buffer or ERR_PTR(-ENOMEM).
 */
static void *diag204_alloc_rbuf(void)
{
	unsigned long page = __get_free_pages(GFP_KERNEL, 0);

	diag204_buf = (void *) page;
	if (!page)
		return ERR_PTR(-ENOMEM);

	diag204_buf_pages = 1;
	return diag204_buf;
}

/*
 * Return the diag 204 buffer and store its size in pages in @pages.
 *
 * An already cached buffer is returned as is, regardless of @fmt. Otherwise
 * a one page buffer is allocated for INFO_SIMPLE; for the extended format the
 * required page count is first queried with subcode SUBC_RSI and a vmalloc
 * buffer of that size is allocated.
 *
 * Returns the buffer, ERR_PTR(-ENOSYS) if the size query does not yield a
 * positive page count, or the error of the allocation helper.
 */
static void *diag204_get_buffer(enum diag204_format fmt, int *pages)
{
	int needed;

	if (diag204_buf) {
		*pages = diag204_buf_pages;
		return diag204_buf;
	}

	if (fmt == INFO_SIMPLE) {
		*pages = 1;
		return diag204_alloc_rbuf();
	}

	/* INFO_EXT */
	needed = diag204((unsigned long)SUBC_RSI |
			 (unsigned long)INFO_EXT, 0, NULL);
	*pages = needed;
	if (needed <= 0)
		return ERR_PTR(-ENOSYS);
	return diag204_alloc_vbuf(needed);
}

/*
 * diag204_probe() has to find out which type of diagnose 204 implementation
 * we have on our machine. Currently there are three possible scenarios:
 *   - subcode 4   + simple data format (only one page)
 *   - subcode 4-6 + extended data format
 *   - subcode 4-7 + extended data format
 *
 * Subcode 5 is used to retrieve the size of the data, provided by subcodes
 * 6 and 7. Subcode 7 basically has the same function as subcode 6. In addition
 * to subcode 6 it provides also information about secondary cpus.
 * In order to get as much information as possible, we first try
 * subcode 7, then 6 and if both fail, we use subcode 4.
 */

static int diag204_probe(void)
{
	void *probe_buf;
	int pages, ext_ok;

	/* Extended format first: subcode 7, then subcode 6 */
	probe_buf = diag204_get_buffer(INFO_EXT, &pages);
	if (!IS_ERR(probe_buf)) {
		ext_ok = 1;
		if (diag204((unsigned long)SUBC_STIB7 |
			    (unsigned long)INFO_EXT, pages, probe_buf) >= 0)
			diag204_store_sc = SUBC_STIB7;
		else if (diag204((unsigned long)SUBC_STIB6 |
				 (unsigned long)INFO_EXT, pages,
				 probe_buf) >= 0)
			diag204_store_sc = SUBC_STIB6;
		else
			ext_ok = 0;

		/* The probe buffer is released on success and on failure */
		diag204_free_buffer();
		if (ext_ok) {
			diag204_info_type = INFO_EXT;
			return 0;
		}
	}

	/* subcodes 6 and 7 failed, now try subcode 4 */

	probe_buf = diag204_get_buffer(INFO_SIMPLE, &pages);
	if (IS_ERR(probe_buf))
		return PTR_ERR(probe_buf);

	if (diag204((unsigned long)SUBC_STIB4 |
		    (unsigned long)INFO_SIMPLE, pages, probe_buf) < 0) {
		diag204_free_buffer();
		return -ENOSYS;
	}

	diag204_store_sc = SUBC_STIB4;
	diag204_info_type = INFO_SIMPLE;
	diag204_free_buffer();
	return 0;
}

/*
 * Store diag 204 data into @buf (@pages pages) using the subcode and data
 * format selected by diag204_probe().
 * Returns 0 on success and -ENOSYS if diag204() reports a negative value.
 */
static int diag204_do_store(void *buf, int pages)
{
	unsigned long subcode;

	subcode = (unsigned long) diag204_store_sc |
		  (unsigned long) diag204_info_type;
	if (diag204(subcode, pages, buf) < 0)
		return -ENOSYS;
	return 0;
}

/*
 * Get the (cached) diag 204 buffer and fill it with current data.
 * Returns the buffer, or an ERR_PTR if either the buffer could not be
 * obtained or the store failed.
 */
static void *diag204_store(void)
{
	void *store_buf;
	int pages, rc;

	store_buf = diag204_get_buffer(diag204_info_type, &pages);
	if (IS_ERR(store_buf))
		return store_buf;

	rc = diag204_do_store(store_buf, pages);
	return rc ? ERR_PTR(rc) : store_buf;
}

/* Diagnose 224 functions */

/*
 * Issue diagnose 0x224 with @ptr as the buffer operand.
 * Returns 0 when the instruction after the diag ("lhi %0,0x0") is executed,
 * otherwise the initial value -EOPNOTSUPP.
 */
static int diag224(void *ptr)
{
	int rc = -EOPNOTSUPP;

	/*
	 * NOTE(review): the exception table entry presumably makes a program
	 * check on the diag instruction continue at label 1, skipping the lhi
	 * so that rc keeps -EOPNOTSUPP - confirm against the EX_TABLE
	 * definition.
	 */
	asm volatile(
		"	diag	%1,%2,0x224\n"
		"0:	lhi	%0,0x0\n"
		"1:\n"
		EX_TABLE(0b,1b)
		: "+d" (rc) :"d" (0), "d" (ptr) : "memory");
	return rc;
}

static int diag224_get_name_table(void)
{
	/* memory must be below 2GB */
	diag224_cpu_names = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA);
	if (!diag224_cpu_names)
		return -ENOMEM;
	if (diag224(diag224_cpu_names)) {
		kfree(diag224_cpu_names);
		return -EOPNOTSUPP;
	}
	EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16);
	return 0;
}

static void diag224_delete_name_table(void)
{
	kfree(diag224_cpu_names);
}

/*
 * Copy the cpu type name with the given index out of the diag 224 name
 * table. Entry @index lives at offset (@index + 1) * CPU_NAME_LEN.
 * @name must provide at least CPU_NAME_LEN + 1 bytes; the copy is
 * 0 terminated and passed to strim(). Always returns 0.
 */
static int diag224_idx2name(int index, char *name)
{
	const char *entry = diag224_cpu_names + ((index + 1) * CPU_NAME_LEN);

	memcpy(name, entry, CPU_NAME_LEN);
	name[CPU_NAME_LEN] = 0;
	strim(name);
	return 0;
}

/*
 * Header placed in front of the raw diag 204 data that dbfs_d204_create()
 * builds. Packed; 8 + 2 + 1 + 53 = 64 bytes.
 */
struct dbfs_d204_hdr {
	u64	len;		/* Length of d204 buffer without header */
	u16	version;	/* Version of header */
	u8	sc;		/* Used subcode */
	char	reserved[53];
} __attribute__ ((packed));

/*
 * Header plus diag 204 data as built by dbfs_d204_create(). buf is a flexible
 * array member; dbfs_d204_create() positions the struct so that buf starts on
 * a page boundary.
 */
struct dbfs_d204 {
	struct dbfs_d204_hdr	hdr;	/* 64 byte header */
	char			buf[];	/* d204 buffer */
} __attribute__ ((packed));

/*
 * data_create callback of dbfs_file_d204: build a fresh snapshot consisting
 * of a dbfs_d204_hdr followed by the raw diag 204 data.
 *
 * @data:          receives the start of the header
 * @data_free_ptr: receives the vzalloc'ed base pointer to free later
 * @size:          receives header size plus data length
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error of
 * diag204_do_store().
 */
static int dbfs_d204_create(void **data, void **data_free_ptr, size_t *size)
{
	struct dbfs_d204 *d204;
	void *base, *aligned;
	int rc, buf_size;

	/* One spare page so that the data area can be page aligned */
	buf_size = PAGE_SIZE * (diag204_buf_pages + 1) + sizeof(d204->hdr);
	base = vzalloc(buf_size);
	if (!base)
		return -ENOMEM;

	/* Put the header directly in front of a page aligned data area */
	aligned = page_align_ptr(base + sizeof(d204->hdr));
	d204 = aligned - sizeof(d204->hdr);

	rc = diag204_do_store(d204->buf, diag204_buf_pages);
	if (rc) {
		vfree(base);
		return rc;
	}

	d204->hdr.len = PAGE_SIZE * diag204_buf_pages;
	d204->hdr.version = DBFS_D204_HDR_VERSION;
	d204->hdr.sc = diag204_store_sc;

	*data_free_ptr = base;
	*data = d204;
	*size = d204->hdr.len + sizeof(struct dbfs_d204_hdr);
	return 0;
}

/*
 * Description of the "diag_204" file passed to hypfs_dbfs_create_file().
 * The data built by dbfs_d204_create() is released with vfree().
 */
static struct hypfs_dbfs_file dbfs_file_d204 = {
	.name		= "diag_204",
	.data_create	= dbfs_d204_create,
	.data_free	= vfree,
};

/*
 * Initialize the diag 204/224 backend.
 *
 * Probes which diag 204 variant is usable, registers the "diag_204" dbfs
 * file when the extended data format is available, and on LPAR loads the
 * diag 224 cpu type name table.
 *
 * Returns 0 on success, -ENODATA if no diag 204 variant works, or the error
 * of the failing setup step. If the name table cannot be loaded, the dbfs
 * file registered before is removed again.
 */
__init int hypfs_diag_init(void)
{
	int rc;

	if (diag204_probe()) {
		pr_err("The hardware system does not support hypfs\n");
		return -ENODATA;
	}
	if (diag204_info_type == INFO_EXT) {
		rc = hypfs_dbfs_create_file(&dbfs_file_d204);
		if (rc)
			return rc;
	}
	if (MACHINE_IS_LPAR) {
		rc = diag224_get_name_table();
		if (rc) {
			pr_err("The hardware system does not provide all "
			       "functions required by hypfs\n");
			/*
			 * Undo the registration above through the same
			 * interface that created it; dbfs_d204_file is never
			 * assigned, so removing that dentry would do nothing.
			 */
			if (diag204_info_type == INFO_EXT)
				hypfs_dbfs_remove_file(&dbfs_file_d204);
			return rc;
		}
	}
	return 0;
}

/*
 * Release the resources of the diag backend: the diag 224 name table, the
 * cached diag 204 buffer and the "diag_204" dbfs file.
 */
void hypfs_diag_exit(void)
{
	/* NOTE(review): no assignment to dbfs_d204_file is visible in this file */
	debugfs_remove(dbfs_d204_file);
	diag224_delete_name_table();
	diag204_free_buffer();
	hypfs_dbfs_remove_file(&dbfs_file_d204);
}

/*
 * Functions to create the directory structure
 * *******************************************
 */

/*
 * Create the directory and attribute files for one cpu of a partition.
 *
 * @cpus_dir: "cpus" directory of the partition
 * @cpu_info: cpu info block in the format given by diag204_info_type
 *
 * The directory is named after the cpu address and receives the files
 * "mgmtime" (acc_time - lp_time), "cputime" (lp_time), "onlinetime"
 * (extended format only) and "type" (name from the diag 224 table).
 *
 * Returns 0 on success or the negative error of the failing hypfs call.
 */
static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info)
{
	struct dentry *cpu_dir;
	char buffer[TMP_SIZE];
	void *rc;

	snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_info_type,
							    cpu_info));
	cpu_dir = hypfs_mkdir(cpus_dir, buffer);
	/* Do not hand an error pointer to the file creation helpers below */
	if (IS_ERR(cpu_dir))
		return PTR_ERR(cpu_dir);
	rc = hypfs_create_u64(cpu_dir, "mgmtime",
			      cpu_info__acc_time(diag204_info_type, cpu_info) -
			      cpu_info__lp_time(diag204_info_type, cpu_info));
	if (IS_ERR(rc))
		return PTR_ERR(rc);
	rc = hypfs_create_u64(cpu_dir, "cputime",
			      cpu_info__lp_time(diag204_info_type, cpu_info));
	if (IS_ERR(rc))
		return PTR_ERR(rc);
	if (diag204_info_type == INFO_EXT) {
		rc = hypfs_create_u64(cpu_dir, "onlinetime",
				      cpu_info__online_time(diag204_info_type,
							    cpu_info));
		if (IS_ERR(rc))
			return PTR_ERR(rc);
	}
	/* buffer is reused: it now receives the cpu type name */
	diag224_idx2name(cpu_info__ctidx(diag204_info_type, cpu_info), buffer);
	rc = hypfs_create_str(cpu_dir, "type", buffer);
	return PTR_RET(rc);
}

/*
 * Create the directory tree for one partition: a directory named after the
 * partition with a "cpus" subdirectory holding one entry per cpu info block.
 *
 * @systems_dir: parent directory for the partition directory
 * @part_hdr:    partition header in the format given by diag204_info_type
 *
 * Returns the address directly behind the last cpu info block of this
 * partition, or an ERR_PTR on failure.
 */
static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr)
{
	char lpar_name[LPAR_NAME_LEN + 1];
	struct dentry *lpar_dir, *cpus_dir;
	void *cur_cpu;
	int idx, rc;

	part_hdr__part_name(diag204_info_type, part_hdr, lpar_name);
	lpar_name[LPAR_NAME_LEN] = 0;

	lpar_dir = hypfs_mkdir(systems_dir, lpar_name);
	if (IS_ERR(lpar_dir))
		return lpar_dir;
	cpus_dir = hypfs_mkdir(lpar_dir, "cpus");
	if (IS_ERR(cpus_dir))
		return cpus_dir;

	/* The cpu info blocks start right behind the partition header */
	cur_cpu = part_hdr + part_hdr__size(diag204_info_type);
	idx = 0;
	while (idx < part_hdr__rcpus(diag204_info_type, part_hdr)) {
		rc = hypfs_create_cpu_files(cpus_dir, cur_cpu);
		if (rc)
			return ERR_PTR(rc);
		cur_cpu += cpu_info__size(diag204_info_type);
		idx++;
	}
	return cur_cpu;
}

/*
 * Create the directory and attribute files for one physical cpu.
 *
 * @cpus_dir: "cpus" directory for the physical cpus
 * @cpu_info: physical cpu block in the format given by diag204_info_type
 *
 * The directory is named after the cpu address and receives the files
 * "mgmtime" and "type". Returns 0 or a negative error code.
 */
static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info)
{
	char buffer[TMP_SIZE];
	struct dentry *cpu_dir;
	void *file;

	snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_info_type,
							    cpu_info));
	cpu_dir = hypfs_mkdir(cpus_dir, buffer);
	if (IS_ERR(cpu_dir))
		return PTR_ERR(cpu_dir);

	file = hypfs_create_u64(cpu_dir, "mgmtime",
				phys_cpu__mgm_time(diag204_info_type,
						   cpu_info));
	if (IS_ERR(file))
		return PTR_ERR(file);

	/* buffer is reused: it now receives the cpu type name */
	diag224_idx2name(phys_cpu__ctidx(diag204_info_type, cpu_info), buffer);
	file = hypfs_create_str(cpu_dir, "type", buffer);
	return PTR_RET(file);
}

/*
 * Create the "cpus" directory below @parent_dir with one entry per physical
 * cpu block that follows @phys_hdr.
 *
 * Returns the address directly behind the last physical cpu block, or an
 * ERR_PTR on failure.
 */
static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr)
{
	struct dentry *cpus_dir;
	void *cur_cpu;
	int idx, rc;

	cpus_dir = hypfs_mkdir(parent_dir, "cpus");
	if (IS_ERR(cpus_dir))
		return cpus_dir;

	/* The physical cpu blocks start right behind the physical header */
	cur_cpu = phys_hdr + phys_hdr__size(diag204_info_type);
	idx = 0;
	while (idx < phys_hdr__cpus(diag204_info_type, phys_hdr)) {
		rc = hypfs_create_phys_cpu_files(cpus_dir, cur_cpu);
		if (rc)
			return ERR_PTR(rc);
		cur_cpu += phys_cpu__size(diag204_info_type);
		idx++;
	}
	return cur_cpu;
}

/*
 * Populate @root from a fresh diag 204 store.
 *
 * Creates "systems" with one directory per partition, the physical "cpus"
 * directory when the LPAR_PHYS_FLG bit is set in the header flags, and
 * "hyp/type". Returns 0 on success or the negative error of the first
 * failing step.
 */
int hypfs_diag_create_files(struct dentry *root)
{
	struct dentry *systems_dir, *hyp_dir;
	void *d204_data, *time_hdr, *cursor, *res;
	int idx;

	d204_data = diag204_store();
	if (IS_ERR(d204_data))
		return PTR_ERR(d204_data);

	systems_dir = hypfs_mkdir(root, "systems");
	if (IS_ERR(systems_dir))
		return PTR_ERR(systems_dir);

	/* The buffer starts with the time header, partition blocks follow */
	time_hdr = d204_data;
	cursor = time_hdr + info_blk_hdr__size(diag204_info_type);
	for (idx = 0;
	     idx < info_blk_hdr__npar(diag204_info_type, time_hdr); idx++) {
		/* each call returns the start of the next block */
		cursor = hypfs_create_lpar_files(systems_dir, cursor);
		if (IS_ERR(cursor))
			return PTR_ERR(cursor);
	}

	if (info_blk_hdr__flags(diag204_info_type, time_hdr) & LPAR_PHYS_FLG) {
		res = hypfs_create_phys_files(root, cursor);
		if (IS_ERR(res))
			return PTR_ERR(res);
	}

	hyp_dir = hypfs_mkdir(root, "hyp");
	if (IS_ERR(hyp_dir))
		return PTR_ERR(hyp_dir);

	res = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor");
	if (IS_ERR(res))
		return PTR_ERR(res);

	return 0;
}