/*
 * arch/metag/mm/cache.c
 *
 * Copyright (C) 2001, 2002, 2005, 2007, 2012 Imagination Technologies.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License version 2 as published by the
 * Free Software Foundation.
 *
 * Cache control code
 */

#include <linux/export.h>
#include <linux/io.h>
#include <asm/cacheflush.h>
#include <asm/core_reg.h>
#include <asm/global_lock.h>
#include <asm/metag_isa.h>
#include <asm/metag_mem.h>
#include <asm/metag_regs.h>

#define DEFAULT_CACHE_WAYS_LOG2	2

/*
 * Size of a set in the caches. Initialised for default 16K stride, adjusted
 * according to values passed through TBI global heap segment via LDLK (on ATP)
 * or config registers (on HTP/MTP)
 */
static int dcache_set_shift = METAG_TBI_CACHE_SIZE_BASE_LOG2
					- DEFAULT_CACHE_WAYS_LOG2;
static int icache_set_shift = METAG_TBI_CACHE_SIZE_BASE_LOG2
					- DEFAULT_CACHE_WAYS_LOG2;
/*
 * The number of sets in the caches. Initialised for HTP/ATP, adjusted
 * according to NOMMU setting in config registers
 */
static unsigned char dcache_sets_log2 = DEFAULT_CACHE_WAYS_LOG2;
static unsigned char icache_sets_log2 = DEFAULT_CACHE_WAYS_LOG2;

#ifndef CONFIG_METAG_META12
/**
 * metag_lnkget_probe() - Probe whether lnkget/lnkset go around the cache
 */
static volatile u32 lnkget_testdata[16] __initdata __aligned(64);

#define LNKGET_CONSTANT 0xdeadbeef

void __init metag_lnkget_probe(void)
{
	int temp;
	long flags;

	/*
	 * It's conceivable the user has configured a globally coherent cache
	 * shared with non-Linux hardware threads, so use LOCK2 to prevent them
	 * from executing and causing cache eviction during the test.
	 */
	__global_lock2(flags);

	/* read a value to bring it into the cache */
	(void)lnkget_testdata[0];
	lnkget_testdata[0] = 0;

	/* lnkget/lnkset it to modify it */
	asm volatile(
		"1:	LNKGETD %0, [%1]\n"
		"	LNKSETD [%1], %2\n"
		"	DEFR	%0, TXSTAT\n"
		"	ANDT	%0, %0, #HI(0x3f000000)\n"
		"	CMPT	%0, #HI(0x02000000)\n"
		"	BNZ	1b\n"
		: "=&d" (temp)
		: "da" (&lnkget_testdata[0]), "bd" (LNKGET_CONSTANT)
		: "cc");

	/* re-read it to see if the cached value changed */
	temp = lnkget_testdata[0];

	__global_unlock2(flags);

	/* flush the cache line to fix any incoherency */
	__builtin_dcache_flush((void *)&lnkget_testdata[0]);

#if defined(CONFIG_METAG_LNKGET_AROUND_CACHE)
	/* if the cache is right, LNKGET_AROUND_CACHE is unnecessary */
	if (temp == LNKGET_CONSTANT)
		pr_info("LNKGET/SET go through cache but CONFIG_METAG_LNKGET_AROUND_CACHE=y\n");
#elif defined(CONFIG_METAG_ATOMICITY_LNKGET)
	/*
	 * if the cache is wrong, LNKGET_AROUND_CACHE is really necessary
	 * because the kernel is configured to use LNKGET/SET for atomicity
	 */
	WARN(temp != LNKGET_CONSTANT,
	     "LNKGET/SET go around cache but CONFIG_METAG_LNKGET_AROUND_CACHE=n\n"
	     "Expect kernel failure as it's used for atomicity primitives\n");
#elif defined(CONFIG_SMP)
	/*
	 * if the cache is wrong, LNKGET_AROUND_CACHE should be used or the
	 * gateway page won't flush and userland could break.
	 */
	WARN(temp != LNKGET_CONSTANT,
	     "LNKGET/SET go around cache but CONFIG_METAG_LNKGET_AROUND_CACHE=n\n"
	     "Expect userland failure as it's used for user gateway page\n");
#else
	/*
	 * if the cache is wrong, LNKGET_AROUND_CACHE is set wrong, but it
	 * doesn't actually matter as it doesn't have any effect on !SMP &&
	 * !ATOMICITY_LNKGET.
	 */
	if (temp != LNKGET_CONSTANT)
		pr_warn("LNKGET/SET go around cache but CONFIG_METAG_LNKGET_AROUND_CACHE=n\n");
#endif
}
#endif /* !CONFIG_METAG_META12 */

/**
 * metag_cache_probe() - Probe L1 cache configuration.
 *
 * Probe the L1 cache configuration to aid the L1 physical cache flushing
 * functions.
 */
void __init metag_cache_probe(void)
{
#ifndef CONFIG_METAG_META12
	int coreid = metag_in32(METAC_CORE_ID);
	int config = metag_in32(METAC_CORE_CONFIG2);
	int cfgcache = coreid & METAC_COREID_CFGCACHE_BITS;

	if (cfgcache == METAC_COREID_CFGCACHE_TYPE0 ||
	    cfgcache == METAC_COREID_CFGCACHE_PRIVNOMMU) {
		icache_sets_log2 = 1;
		dcache_sets_log2 = 1;
	}

	/* For normal size caches, the smallest size is 4Kb.
	   For small caches, the smallest size is 64b */
	icache_set_shift = (config & METAC_CORECFG2_ICSMALL_BIT)
				? 6 : 12;
	icache_set_shift += (config & METAC_CORE_C2ICSZ_BITS)
				>> METAC_CORE_C2ICSZ_S;
	icache_set_shift -= icache_sets_log2;

	dcache_set_shift = (config & METAC_CORECFG2_DCSMALL_BIT)
				? 6 : 12;
	dcache_set_shift += (config & METAC_CORECFG2_DCSZ_BITS)
				>> METAC_CORECFG2_DCSZ_S;
	dcache_set_shift -= dcache_sets_log2;

	metag_lnkget_probe();
#else
	/* Extract cache sizes from global heap segment */
	unsigned long val, u;
	int width, shift, addend;
	PTBISEG seg;

	seg = __TBIFindSeg(NULL, TBID_SEG(TBID_THREAD_GLOBAL,
					  TBID_SEGSCOPE_GLOBAL,
					  TBID_SEGTYPE_HEAP));
	if (seg != NULL) {
		val = seg->Data[1];

		/* Work out width of I-cache size bit-field */
		u = ((unsigned long) METAG_TBI_ICACHE_SIZE_BITS)
		       >> METAG_TBI_ICACHE_SIZE_S;
		width = 0;
		while (u & 1) {
			width++;
			u >>= 1;
		}
		/* Extract sign-extended size addend value */
		shift = 32 - (METAG_TBI_ICACHE_SIZE_S + width);
		addend = (long) ((val & METAG_TBI_ICACHE_SIZE_BITS)
				 << shift)
			>> (shift + METAG_TBI_ICACHE_SIZE_S);
		/* Now calculate I-cache set size */
		icache_set_shift = (METAG_TBI_CACHE_SIZE_BASE_LOG2
				    - DEFAULT_CACHE_WAYS_LOG2)
					+ addend;

		/* Similarly for D-cache */
		u = ((unsigned long) METAG_TBI_DCACHE_SIZE_BITS)
		       >> METAG_TBI_DCACHE_SIZE_S;
		width = 0;
		while (u & 1) {
			width++;
			u >>= 1;
		}
		shift = 32 - (METAG_TBI_DCACHE_SIZE_S + width);
		addend = (long) ((val & METAG_TBI_DCACHE_SIZE_BITS)
				 << shift)
			>> (shift + METAG_TBI_DCACHE_SIZE_S);
		dcache_set_shift = (METAG_TBI_CACHE_SIZE_BASE_LOG2
				    - DEFAULT_CACHE_WAYS_LOG2)
					+ addend;
	}
#endif
}

static void metag_phys_data_cache_flush(const void *start)
{
	unsigned long flush0, flush1, flush2, flush3;
	int loops, step;
	int thread;
	int part, offset;
	int set_shift;

	/* Use a sequence of writes to flush the cache region requested */
	thread = (__core_reg_get(TXENABLE) & TXENABLE_THREAD_BITS)
					  >> TXENABLE_THREAD_S;

	/* Cache is broken into sets which lie in contiguous RAMs */
	set_shift = dcache_set_shift;

	/* Move to the base of the physical cache flush region */
	flush0 = LINSYSCFLUSH_DCACHE_LINE;
	step   = 64;

	/* Get partition data for this thread */
	part = metag_in32(SYSC_DCPART0 +
			      (SYSC_xCPARTn_STRIDE * thread));

	if ((int)start < 0)
		/* Access Global vs Local partition */
		part >>= SYSC_xCPARTG_AND_S
			- SYSC_xCPARTL_AND_S;

	/* Extract offset and move SetOff */
	offset = (part & SYSC_xCPARTL_OR_BITS)
			>> SYSC_xCPARTL_OR_S;
	flush0 += (offset << (set_shift - 4));

	/* Shrink size */
	part = (part & SYSC_xCPARTL_AND_BITS)
			>> SYSC_xCPARTL_AND_S;
	loops = ((part + 1) << (set_shift - 4));

	/* Reduce loops by step of cache line size */
	loops /= step;

	flush1 = flush0 + (1 << set_shift);
	flush2 = flush0 + (2 << set_shift);
	flush3 = flush0 + (3 << set_shift);

	if (dcache_sets_log2 == 1) {
		flush2 = flush1;
		flush3 = flush1 + step;
		flush1 = flush0 + step;
		step  <<= 1;
		loops >>= 1;
	}

	/* Clear loops ways in cache */
	while (loops-- != 0) {
		/* Clear the ways. */
#if 0
		/*
		 * GCC doesn't generate very good code for this so we
		 * provide inline assembly instead.
		 */
		metag_out8(0, flush0);
		metag_out8(0, flush1);
		metag_out8(0, flush2);
		metag_out8(0, flush3);

		flush0 += step;
		flush1 += step;
		flush2 += step;
		flush3 += step;
#else
		asm volatile (
			"SETB\t[%0+%4++],%5\n"
			"SETB\t[%1+%4++],%5\n"
			"SETB\t[%2+%4++],%5\n"
			"SETB\t[%3+%4++],%5\n"
			: "+e" (flush0),
			  "+e" (flush1),
			  "+e" (flush2),
			  "+e" (flush3)
			: "e" (step), "a" (0));
#endif
	}
}

void metag_data_cache_flush_all(const void *start)
{
	if ((metag_in32(SYSC_CACHE_MMU_CONFIG) & SYSC_CMMUCFG_DC_ON_BIT) == 0)
		/* No need to flush the data cache it's not actually enabled */
		return;

	metag_phys_data_cache_flush(start);
}

void metag_data_cache_flush(const void *start, int bytes)
{
	unsigned long flush0;
	int loops, step;

	if ((metag_in32(SYSC_CACHE_MMU_CONFIG) & SYSC_CMMUCFG_DC_ON_BIT) == 0)
		/* No need to flush the data cache it's not actually enabled */
		return;

	if (bytes >= 4096) {
		metag_phys_data_cache_flush(start);
		return;
	}

	/* Use linear cache flush mechanism on META IP */
	flush0 = (int)start;
	loops  = ((int)start & (DCACHE_LINE_BYTES - 1)) + bytes +
					(DCACHE_LINE_BYTES - 1);
	loops  >>= DCACHE_LINE_S;

#define PRIM_FLUSH(addr, offset) do {			\
	int __addr = ((int) (addr)) + ((offset) * 64);	\
	__builtin_dcache_flush((void *)(__addr));	\
	} while (0)

#define LOOP_INC (4*64)

	do {
		/* By default stop */
		step = 0;

		switch (loops) {
		/* Drop Thru Cases! */
		default:
			PRIM_FLUSH(flush0, 3);
			loops -= 4;
			step = 1;
		case 3:
			PRIM_FLUSH(flush0, 2);
		case 2:
			PRIM_FLUSH(flush0, 1);
		case 1:
			PRIM_FLUSH(flush0, 0);
			flush0 += LOOP_INC;
		case 0:
			break;
		}
	} while (step);
}
EXPORT_SYMBOL(metag_data_cache_flush);

static void metag_phys_code_cache_flush(const void *start, int bytes)
{
	unsigned long flush0, flush1, flush2, flush3, end_set;
	int loops, step;
	int thread;
	int set_shift, set_size;
	int part, offset;

	/* Use a sequence of writes to flush the cache region requested */
	thread = (__core_reg_get(TXENABLE) & TXENABLE_THREAD_BITS)
					  >> TXENABLE_THREAD_S;
	set_shift = icache_set_shift;

	/* Move to the base of the physical cache flush region */
	flush0 = LINSYSCFLUSH_ICACHE_LINE;
	step   = 64;

	/* Get partition code for this thread */
	part = metag_in32(SYSC_ICPART0 +
			  (SYSC_xCPARTn_STRIDE * thread));

	if ((int)start < 0)
		/* Access Global vs Local partition */
		part >>= SYSC_xCPARTG_AND_S-SYSC_xCPARTL_AND_S;

	/* Extract offset and move SetOff */
	offset = (part & SYSC_xCPARTL_OR_BITS)
			>> SYSC_xCPARTL_OR_S;
	flush0 += (offset << (set_shift - 4));

	/* Shrink size */
	part = (part & SYSC_xCPARTL_AND_BITS)
			>> SYSC_xCPARTL_AND_S;
	loops = ((part + 1) << (set_shift - 4));

	/* Where does the Set end? */
	end_set = flush0 + loops;
	set_size = loops;

#ifdef CONFIG_METAG_META12
	if ((bytes < 4096) && (bytes < loops)) {
		/* Unreachable on HTP/MTP */
		/* Only target the sets that could be relavent */
		flush0 += (loops - step) & ((int) start);
		loops = (((int) start) & (step-1)) + bytes + step - 1;
	}
#endif

	/* Reduce loops by step of cache line size */
	loops /= step;

	flush1 = flush0 + (1<<set_shift);
	flush2 = flush0 + (2<<set_shift);
	flush3 = flush0 + (3<<set_shift);

	if (icache_sets_log2 == 1) {
		flush2 = flush1;
		flush3 = flush1 + step;
		flush1 = flush0 + step;
#if 0
		/* flush0 will stop one line early in this case
		 * (flush1 will do the final line).
		 * However we don't correct end_set here at the moment
		 * because it will never wrap on HTP/MTP
		 */
		end_set -= step;
#endif
		step  <<= 1;
		loops >>= 1;
	}

	/* Clear loops ways in cache */
	while (loops-- != 0) {
#if 0
		/*
		 * GCC doesn't generate very good code for this so we
		 * provide inline assembly instead.
		 */
		/* Clear the ways */
		metag_out8(0, flush0);
		metag_out8(0, flush1);
		metag_out8(0, flush2);
		metag_out8(0, flush3);

		flush0 += step;
		flush1 += step;
		flush2 += step;
		flush3 += step;
#else
		asm volatile (
			"SETB\t[%0+%4++],%5\n"
			"SETB\t[%1+%4++],%5\n"
			"SETB\t[%2+%4++],%5\n"
			"SETB\t[%3+%4++],%5\n"
			: "+e" (flush0),
			  "+e" (flush1),
			  "+e" (flush2),
			  "+e" (flush3)
			: "e" (step), "a" (0));
#endif

		if (flush0 == end_set) {
			/* Wrap within Set 0 */
			flush0 -= set_size;
			flush1 -= set_size;
			flush2 -= set_size;
			flush3 -= set_size;
		}
	}
}

void metag_code_cache_flush_all(const void *start)
{
	if ((metag_in32(SYSC_CACHE_MMU_CONFIG) & SYSC_CMMUCFG_IC_ON_BIT) == 0)
		/* No need to flush the code cache it's not actually enabled */
		return;

	metag_phys_code_cache_flush(start, 4096);
}
EXPORT_SYMBOL(metag_code_cache_flush_all);

void metag_code_cache_flush(const void *start, int bytes)
{
#ifndef CONFIG_METAG_META12
	void *flush;
	int loops, step;
#endif /* !CONFIG_METAG_META12 */

	if ((metag_in32(SYSC_CACHE_MMU_CONFIG) & SYSC_CMMUCFG_IC_ON_BIT) == 0)
		/* No need to flush the code cache it's not actually enabled */
		return;

#ifdef CONFIG_METAG_META12
	/* CACHEWD isn't available on Meta1, so always do full cache flush */
	metag_phys_code_cache_flush(start, bytes);

#else /* CONFIG_METAG_META12 */
	/* If large size do full physical cache flush */
	if (bytes >= 4096) {
		metag_phys_code_cache_flush(start, bytes);
		return;
	}

	/* Use linear cache flush mechanism on META IP */
	flush = (void *)((int)start & ~(ICACHE_LINE_BYTES-1));
	loops  = ((int)start & (ICACHE_LINE_BYTES-1)) + bytes +
		(ICACHE_LINE_BYTES-1);
	loops  >>= ICACHE_LINE_S;

#define PRIM_IFLUSH(addr, offset) \
	__builtin_meta2_cachewd(((addr) + ((offset) * 64)), CACHEW_ICACHE_BIT)

#define LOOP_INC (4*64)

	do {
		/* By default stop */
		step = 0;

		switch (loops) {
		/* Drop Thru Cases! */
		default:
			PRIM_IFLUSH(flush, 3);
			loops -= 4;
			step = 1;
		case 3:
			PRIM_IFLUSH(flush, 2);
		case 2:
			PRIM_IFLUSH(flush, 1);
		case 1:
			PRIM_IFLUSH(flush, 0);
			flush += LOOP_INC;
		case 0:
			break;
		}
	} while (step);
#endif /* !CONFIG_METAG_META12 */
}
EXPORT_SYMBOL(metag_code_cache_flush);