/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * (C) Copyright 2008 - 2013 Tensilica Inc.
 * (C) Copyright 2014 - 2016 Cadence Design Systems Inc.
 */

#include <config.h>
#include <asm/asmmacro.h>
#include <asm/cacheasm.h>
#include <asm/regs.h>
#include <asm/arch/tie.h>
#include <asm-offsets.h>

/*
 * Offsets into the pt_regs structure.
 * Make sure these always match the structure defined in ptrace.h!
 *
 * ExceptionHandler below builds one such frame on the stack. The notes on
 * the right describe what this file stores into each slot; slots without a
 * note are not written by the code in this file.
 */

#define PT_PC		0	/* EPC1 */
#define PT_PS		4	/* PS as found on entry to ExceptionHandler */
#define PT_DEPC		8
#define PT_EXCCAUSE	12	/* EXCCAUSE */
#define PT_EXCVADDR	16	/* EXCVADDR */
#define PT_DEBUGCAUSE	20
#define PT_WMASK	24
#define PT_LBEG		28	/* LBEG (only if XCHAL_HAVE_LOOPS) */
#define PT_LEND		32	/* LEND (only if XCHAL_HAVE_LOOPS) */
#define PT_LCOUNT	36	/* LCOUNT (only if XCHAL_HAVE_LOOPS) */
#define PT_SAR		40	/* SAR */
#define PT_WINDOWBASE	44
#define PT_WINDOWSTART	48	/* WINDOWSTART (only if XCHAL_HAVE_WINDOWED) */
#define PT_SYSCALL	52
#define PT_ICOUNTLEVEL	56
#define PT_RESERVED	60
#define PT_AREG		64	/* a0..a15, four bytes each */
#define PT_SIZE		(64 + 64)	/* 64 bytes of SRs + 16 address registers */

/*
 * Cache attributes are different for full MMU and region protection.
 *
 * CA_WRITEBACK is the value that _start places into the low four bits of
 * the spanning-way data/instruction TLB entries covering the relocated
 * image when the respective cache is not configured off.
 */

#if XCHAL_HAVE_PTP_MMU
#define CA_WRITEBACK	(0x7)
#else
#define CA_WRITEBACK	(0x4)
#endif

/*
 * Reset vector.
 * Only a trampoline to jump to _start
 * (Note that we have to mark the section writable as the section contains
 *  a relocatable literal)
 *
 * The address of _start is kept as a literal word directly inside the
 * vector; the first instruction jumps over it, the literal is then loaded
 * into a2 and used as the jump target.
 */

	.section .ResetVector.text, "awx"
	.global _ResetVector
_ResetVector:

	j	1f			/* skip the literal word below */
	.align 4
2:	.long	_start			/* literal: address of _start */
1:	l32r	a2, 2b			/* a2 = address of _start */
	jx	a2


/*
 * Processor initialization. We still run in rom space.
 *
 * NOTE: Running in ROM
 *  For Xtensa, we currently don't allow to run some code from ROM but
 *  unpack the data immediately to memory. This requires, for example,
 *  that DDR has been set up before running U-Boot. (See also comments
 *  inline for ways to change it)
 *
 * The first part of _start puts the core into a known state, each step
 * depending on the core configuration: PTEVADDR and all DBREAKC registers
 * are zeroed, the register window is reset, VECBASE is loaded with its
 * reset value, LCOUNT is zeroed, PS is set up, and the caches are unlocked
 * and invalidated. a0 is kept at zero throughout; a2 and a3 are scratch.
 */

	.section .reset.text, "ax"
	.global _start
	.align 4
_start:
	/* Keep a0 = 0 for various initializations */

	movi	a0, 0

	/*
	 * For full MMU cores, put page table at unmapped virtual address.
	 * This ensures that accesses outside the static maps result
	 * in miss exceptions rather than random behaviour.
	 */

#if XCHAL_HAVE_PTP_MMU
	wsr	a0, PTEVADDR
#endif

	/*
	 * Disable dbreak debug exceptions: write zero to each of the
	 * XCHAL_NUM_DBREAK registers DBREAKC + 0, DBREAKC + 1, ...
	 */

#if XCHAL_HAVE_DEBUG && XCHAL_NUM_DBREAK > 0
	.set	_index, 0
	.rept	XCHAL_NUM_DBREAK
	wsr	a0, DBREAKC + _index
	.set	_index, _index + 1
	.endr
#endif

	/* Reset windowbase (to 0) and windowstart (to 1) */

#if XCHAL_HAVE_WINDOWED
	movi	a3, 1
	wsr	a3, windowstart
	wsr	a0, windowbase
	rsync
	movi	a0, 0			/* windowbase might have changed */
#endif

	/*
	 * Vecbase in the bitstream may differ from the header files, so
	 * set it explicitly to the reset value from the headers.
	 */

#if XCHAL_HAVE_VECBASE
	movi	a3, XCHAL_VECBASE_RESET_VADDR	/* VECBASE reset value */
	wsr	a3, VECBASE
#endif

#if XCHAL_HAVE_LOOPS
	/* Disable loops */

	wsr	a0, LCOUNT
#endif

	/* Set PS.WOE = 0, PS.EXCM = 0 (for loop), PS.INTLEVEL = EXCM level */

#if XCHAL_HAVE_XEA1
	movi	a2, 1
#else
	movi	a2, XCHAL_EXCM_LEVEL
#endif
	wsr	a2, PS
	rsync

	/*
	 * Unlock and invalidate caches. The macros are provided by
	 * asm/cacheasm.h; a2 and a3 are handed to them as registers they
	 * may use (presumably as scratch - see the macro definitions).
	 */

	___unlock_dcache_all a2, a3
	___invalidate_dcache_all a2, a3
	___unlock_icache_all a2, a3
	___invalidate_icache_all a2, a3

	isync

	/*
	 * Unpack data sections.
	 *
	 * Walk the table between __reloc_table_start and __reloc_table_end.
	 * Each entry consists of three 32-bit words: destination start,
	 * destination end and source start. Entries that are empty or whose
	 * source equals the destination are skipped; all others are copied
	 * one 32-bit word at a time until the destination cursor reaches
	 * the destination end.
	 *
	 * Register use:
	 *   a2 = current table entry	a3 = end of table
	 *   a4 = destination cursor	a5 = destination end
	 *   a6 = source cursor		a7, a8 = scratch
	 */

	movi	a2, __reloc_table_start
	movi	a3, __reloc_table_end

1:	beq	a2, a3, 3f	# no more entries?
	l32i	a4, a2, 0	# start destination (in RAM)
	l32i	a5, a2, 4	# end destination (in RAM)
	l32i	a6, a2, 8	# start source (in ROM)
	addi	a2, a2, 12	# next entry
	beq	a4, a5, 1b	# skip, empty entry
	beq	a4, a6, 1b	# skip, source and destination are the same

	/* If there's memory protection option with 512MB TLB regions and
	 * cache attributes in TLB entries and caching is not inhibited,
	 * enable data/instruction cache for relocated image.
	 *
	 * a7 is built from the destination address with its low 29 bits
	 * cleared (the 512MB region containing it) plus XCHAL_SPANNING_WAY.
	 * For each enabled cache the TLB entry selected by a7 is read, its
	 * low four bits are replaced by CA_WRITEBACK, and it is written back.
	 */
#if XCHAL_HAVE_SPANNING_WAY && \
	(!defined(CONFIG_SYS_DCACHE_OFF) || \
	 !defined(CONFIG_SYS_ICACHE_OFF))
	srli	a7, a4, 29
	slli	a7, a7, 29
	addi	a7, a7, XCHAL_SPANNING_WAY
#ifndef CONFIG_SYS_DCACHE_OFF
	rdtlb1	a8, a7
	srli	a8, a8, 4		/* clear the low four bits ... */
	slli	a8, a8, 4
	addi	a8, a8, CA_WRITEBACK	/* ... and insert CA_WRITEBACK */
	wdtlb	a8, a7
#endif
#ifndef CONFIG_SYS_ICACHE_OFF
	ritlb1	a8, a7
	srli	a8, a8, 4		/* clear the low four bits ... */
	slli	a8, a8, 4
	addi	a8, a8, CA_WRITEBACK	/* ... and insert CA_WRITEBACK */
	witlb	a8, a7
#endif
	isync
#endif

	/* Word copy loop: *a4++ = *a6++ while a4 < a5 (unsigned) */

2:	l32i	a7, a6, 0
	addi	a6, a6, 4
	s32i	a7, a4, 0
	addi	a4, a4, 4
	bltu	a4, a5, 2b
	j	1b

3:	/* All code and initialized data segments have been copied */

	/*
	 * Setup PS, PS.WOE = 1, PS.EXCM = 0, PS.INTLEVEL = EXCM level.
	 * (The WOE bit is only set when not building for the call0 ABI.)
	 */

#if __XTENSA_CALL0_ABI__
	movi	a2, XCHAL_EXCM_LEVEL
#else
	movi	a2, (1<<PS_WOE_BIT) | XCHAL_EXCM_LEVEL
#endif
	wsr	a2, PS
	rsync

	/* Writeback: flush the data cache now that the image is copied */

	___flush_dcache_all a2, a3

#ifdef __XTENSA_WINDOWED_ABI__
	/*
	 * In windowed ABI caller and call target need to be within the same
	 * gigabyte. Put the rest of the code into the text segment and jump
	 * there.
	 */

	movi	a4, .Lboard_init_code
	jx	a4

	.text
	.align	4
.Lboard_init_code:
#endif

	/*
	 * Clear a0 and set the initial stack pointer 16 bytes below
	 * XTENSA_SYS_TEXT_ADDR, rounded down to a 16-byte boundary.
	 */

	movi	a0, 0
	movi	sp, (XTENSA_SYS_TEXT_ADDR - 16) & 0xfffffff0

	/* Optional early debug UART setup; takes no arguments */

#ifdef CONFIG_DEBUG_UART
	movi	a4, debug_uart_init
#ifdef __XTENSA_CALL0_ABI__
	callx0	a4
#else
	callx4	a4
#endif
#endif

	/*
	 * Call board_init_f_alloc_reserve with the current stack pointer
	 * as argument and use its return value as the new stack pointer.
	 * The argument/result register is a2 for call0 and a6 for callx4.
	 */

	movi	a4, board_init_f_alloc_reserve

#ifdef __XTENSA_CALL0_ABI__
	mov	a2, sp
	callx0	a4
	mov	sp, a2
#else
	mov	a6, sp
	callx4	a4
	movsp	sp, a6
#endif

	/*
	 * Call board_init_f_init_reserve. The argument register (a2 or a6)
	 * is not reloaded, so it still carries the value returned by
	 * board_init_f_alloc_reserve above.
	 */

	movi	a4, board_init_f_init_reserve

#ifdef __XTENSA_CALL0_ABI__
	callx0	a4
#else
	callx4	a4
#endif

        /*
	 * Call board initialization routine (never returns).
	 * Its single argument is zero.
	 */

	movi	a4, board_init_f

#ifdef __XTENSA_CALL0_ABI__
	movi	a2, 0
	callx0	a4
#else
	movi	a6, 0
	callx4	a4
#endif
	/* Never Returns; trap with an illegal instruction if it does */
	ill

/*
 * void relocate_code (addr_sp, gd, addr_moni)
 *
 * This "function" does not return, instead it continues in RAM
 * after relocating the monitor code.
 *
 * a2 = addr_sp
 * a3 = gd
 * a4 = destination address
 *
 * The stack pointer is switched to addr_sp (call0 ABI) or addr_sp - 16
 * (windowed ABI) and board_init_r is called with gd as first and the
 * destination address as second argument. An "ill" instruction follows the
 * call in case board_init_r ever returns.
 */
	.text
	.globl relocate_code
	.align 4
relocate_code:
	abi_entry			/* ABI prologue macro, from asm/asmmacro.h */

#ifdef __XTENSA_CALL0_ABI__
	mov	a1, a2			/* sp = addr_sp */
	mov	a2, a3			/* first argument = gd */
	mov	a3, a4			/* second argument = destination address */
	movi	a0, board_init_r
	callx0	a0
#else
	/* We can't movsp here, because the chain of stack frames may cross
	 * the now reserved memory. We need to toss all window frames except
	 * the current, create new pristine stack frame and start from scratch.
	 */
	rsr	a0, windowbase
	ssl	a0
	movi	a0, 1
	sll	a0, a0			/* a0 = 1 << windowbase */
	wsr	a0, windowstart		/* only the current frame stays marked */
	rsync

	movi	a0, 0

	/* Reserve 16-byte save area */
	addi	sp, a2, -16
	mov	a6, a3			/* first argument = gd */
	mov	a7, a4			/* second argument = destination address */
	movi	a4, board_init_r
	callx4	a4
#endif
	ill

#if XCHAL_HAVE_EXCEPTIONS

/*
 * Exception vectors.
 *
 *  Various notes:
 *   - We currently don't use the user exception vector (PS.UM is always 0),
 *     but do define such a vector, just in case. They both jump to the
 *     same exception handler, though.
 *   - We currently only save the bare minimum number of registers:
 *     a0...a15, sar, loop-registers, exception register (epc1, excvaddr,
 *     exccause, depc)
 *   - WINDOWSTART is only saved to identify if registers have been spilled
 *     to the wrong stack (exception stack) while executing the exception
 *     handler.
 */

	/*
	 * Kernel exception vector: stash a2 in EXCSAVE1 and jump to the
	 * common ExceptionHandler, which reads a2 back from EXCSAVE1.
	 */

	.section .KernelExceptionVector.text, "ax"
	.global _KernelExceptionVector
_KernelExceptionVector:

	wsr	a2, EXCSAVE1
	movi	a2, ExceptionHandler
	jx	a2

	/* User exception vector: identical to the kernel vector above. */

	.section .UserExceptionVector.text, "ax"
	.global _UserExceptionVector
_UserExceptionVector:

	wsr	a2, EXCSAVE1
	movi	a2, ExceptionHandler
	jx	a2

	/*
	 * Double exception vector (not built for XEA1): stash the register
	 * used for the call in EXCSAVE1 and call hang. No attempt is made
	 * to return from here.
	 */

#if !XCHAL_HAVE_XEA1
	.section .DoubleExceptionVector.text, "ax"
	.global _DoubleExceptionVector
_DoubleExceptionVector:

#ifdef __XTENSA_CALL0_ABI__
	wsr	a0, EXCSAVE1
	movi    a0, hang                # report and ask user to reset board
	callx0	a0
#else
	wsr	a4, EXCSAVE1
	movi    a4, hang                # report and ask user to reset board
	callx4	a4
#endif
#endif
	/* Does not return here */


	.text
	.align 4
ExceptionHandler:

	/*
	 * Common exception handler, entered from the kernel and user
	 * exception vectors with the interrupted a2 stashed in EXCSAVE1.
	 *
	 * On windowed cores an alloca exception (EXCCAUSE 5) is diverted to
	 * fast_alloca_exception. For everything else a pt_regs frame is
	 * placed PT_SIZE + 16 bytes below the interrupted stack pointer,
	 * a0..a15 and a set of special registers are saved into it, the C
	 * handler is looked up in exc_table (indexed by EXCCAUSE) and called
	 * with the frame address as its only argument. Afterwards the saved
	 * state is restored and the handler returns with rfe.
	 */

	rsr	a2, EXCCAUSE		# find handler

#if XCHAL_HAVE_WINDOWED
	/* Special case for alloca handler */

	bnei	a2, 5, 1f		# jump if not alloca exception

	addi	a1, a1, -16 - 4		# create a small stack frame
	s32i	a3, a1, 0		# and save a3 (a2 still in excsave1)
	movi	a2, fast_alloca_exception
	jx	a2			# jump to fast_alloca_exception
#endif
	/* All other exceptions go here: */

	/*
	 * Create ptrace stack and save a0...a3. a2 points at the new frame
	 * until a1 has been stored; the interrupted a2 is fetched from
	 * EXCSAVE1 (via a3, which has already been saved).
	 */

1:	addi	a2, a1, - PT_SIZE - 16
	s32i	a0, a2, PT_AREG + 0 * 4
	s32i	a1, a2, PT_AREG + 1 * 4
	s32i	a3, a2, PT_AREG + 3 * 4
	rsr	a3, EXCSAVE1
	s32i	a3, a2, PT_AREG + 2 * 4
	mov	a1, a2

	/* Save remaining AR registers */

	s32i	a4, a1, PT_AREG + 4 * 4
	s32i	a5, a1, PT_AREG + 5 * 4
	s32i	a6, a1, PT_AREG + 6 * 4
	s32i	a7, a1, PT_AREG + 7 * 4
	s32i	a8, a1, PT_AREG + 8 * 4
	s32i	a9, a1, PT_AREG + 9 * 4
	s32i	a10, a1, PT_AREG + 10 * 4
	s32i	a11, a1, PT_AREG + 11 * 4
	s32i	a12, a1, PT_AREG + 12 * 4
	s32i	a13, a1, PT_AREG + 13 * 4
	s32i	a14, a1, PT_AREG + 14 * 4
	s32i	a15, a1, PT_AREG + 15 * 4

	/* Save SRs */

#if XCHAL_HAVE_WINDOWED
	rsr	a2, WINDOWSTART
	s32i	a2, a1, PT_WINDOWSTART
#endif

	rsr	a2, SAR
	rsr	a3, EPC1
	rsr	a4, EXCVADDR
	s32i	a2, a1, PT_SAR
	s32i	a3, a1, PT_PC
	s32i	a4, a1, PT_EXCVADDR

	/*
	 * Save the loop registers. LCOUNT is exchanged with zero (xsr), so
	 * it is read and cleared in one step.
	 */

#if XCHAL_HAVE_LOOPS
	movi	a2, 0
	rsr	a3, LBEG
	xsr	a2, LCOUNT
	s32i	a3, a1, PT_LBEG
	rsr	a3, LEND
	s32i	a2, a1, PT_LCOUNT
	s32i	a3, a1, PT_LEND
#endif

	/* Set up C environment and call registered handler */
	/* Setup stack, PS.WOE = 1, PS.EXCM = 0, PS.INTLEVEL = EXCM level. */

	/*
	 * xsr swaps a3 with PS: the new PS value takes effect and a3 ends
	 * up holding the PS value found on entry, which is stored in the
	 * frame.
	 */

	rsr	a2, EXCCAUSE
#if XCHAL_HAVE_XEA1
	movi	a3, (1<<PS_WOE_BIT) | 1
#elif __XTENSA_CALL0_ABI__
	movi	a3, XCHAL_EXCM_LEVEL
#else
	movi	a3, (1<<PS_WOE_BIT) | XCHAL_EXCM_LEVEL
#endif
	xsr	a3, PS
	rsync
	s32i	a2, a1, PT_EXCCAUSE
	s32i	a3, a1, PT_PS

	/* a0 = exc_table[EXCCAUSE] (table of 4-byte entries) */

	movi	a0, exc_table
	addx4	a0, a2, a0
	l32i	a0, a0, 0
#ifdef __XTENSA_CALL0_ABI__
	mov	a2, a1			# Provide stack frame as only argument
	callx0	a0
	l32i	a3, a1, PT_PS		# reload the saved PS after the call
#else
	/*
	 * a3 is not reloaded on this path and is used below as the saved
	 * PS (presumably because callx4 leaves the caller's a0..a3 alone).
	 */
	mov	a6, a1			# Provide stack frame as only argument
	callx4	a0
#endif

	/* Restore PS and go to exception mode (PS.EXCM=1) */

	wsr	a3, PS

	/* Restore SR registers */

#if XCHAL_HAVE_LOOPS
	l32i	a2, a1, PT_LBEG
	l32i	a3, a1, PT_LEND
	l32i	a4, a1, PT_LCOUNT
	wsr	a2, LBEG
	wsr	a3, LEND
	wsr	a4, LCOUNT
#endif

	l32i	a2, a1, PT_SAR
	l32i	a3, a1, PT_PC
	wsr	a2, SAR
	wsr	a3, EPC1

#if XCHAL_HAVE_WINDOWED
	/*
	 * Do we need to simulate a MOVSP?
	 *
	 * "x & (x - 1)" is zero when at most one bit of x is set. The copy
	 * below is done only if the saved WINDOWSTART had more than one bit
	 * set while the current WINDOWSTART has at most one bit set. In that
	 * case the 16 bytes directly below the exception frame (a1 - 16) are
	 * copied to the 16 bytes directly above it (a1 + PT_SIZE).
	 */

	l32i	a2, a1, PT_WINDOWSTART
	addi	a3, a2, -1
	and	a2, a2, a3
	beqz	a2, 1f			# Skip if regs were spilled before exc.

	rsr	a2, WINDOWSTART
	addi	a3, a2, -1
	and	a2, a2, a3
	bnez	a2, 1f			# Skip if registers aren't spilled now

	addi	a2, a1, -16
	l32i	a4, a2, 0
	l32i	a5, a2, 4
	s32i	a4, a1, PT_SIZE + 0
	s32i	a5, a1, PT_SIZE + 4
	l32i	a4, a2, 8
	l32i	a5, a2, 12
	s32i	a4, a1, PT_SIZE + 8
	s32i	a5, a1, PT_SIZE + 12
#endif

	/* Restore address register (a1 last, as it addresses the frame) */

1:	l32i	a15, a1, PT_AREG + 15 * 4
	l32i	a14, a1, PT_AREG + 14 * 4
	l32i	a13, a1, PT_AREG + 13 * 4
	l32i	a12, a1, PT_AREG + 12 * 4
	l32i	a11, a1, PT_AREG + 11 * 4
	l32i	a10, a1, PT_AREG + 10 * 4
	l32i	a9, a1, PT_AREG + 9 * 4
	l32i	a8, a1, PT_AREG + 8 * 4
	l32i	a7, a1, PT_AREG + 7 * 4
	l32i	a6, a1, PT_AREG + 6 * 4
	l32i	a5, a1, PT_AREG + 5 * 4
	l32i	a4, a1, PT_AREG + 4 * 4
	l32i	a3, a1, PT_AREG + 3 * 4
	l32i	a2, a1, PT_AREG + 2 * 4
	l32i	a0, a1, PT_AREG + 0 * 4

	l32i	a1, a1, PT_AREG + 1 * 4 # Remove ptrace stack frame

	rfe

#endif /* XCHAL_HAVE_EXCEPTIONS */

#if XCHAL_HAVE_WINDOWED

/*
 * Window overflow and underflow handlers.
 * The handlers must be 64 bytes apart. They are laid out in pairs, each
 * overflow handler followed by the matching underflow handler: first the
 * 4-register pair, then the 8-register pair, then the 12-register pair.
 *
 * Note: We rerun the underflow handlers if we hit an exception, so
 *	 we try to access any page that would cause a page fault early.
 */

	.section .WindowVectors.text, "ax"

/*
 * 4-Register Window Overflow Vector (Handler)
 *
 * Stores a0..a3 into the 16 bytes directly below the address in a5
 * and returns with rfwo.
 */

	.align 64
.global _WindowOverflow4
_WindowOverflow4:
	s32e	a0, a5, -16
	s32e	a1, a5, -12
	s32e	a2, a5,  -8
	s32e	a3, a5,  -4
	rfwo


/*
 * 4-Register Window Underflow Vector (Handler)
 *
 * Reloads a0..a3 from the 16 bytes directly below the address in a5
 * and returns with rfwu. fast_alloca_exception is placed right behind
 * this handler and also branches to its start.
 */

	.align 64
.global _WindowUnderflow4
_WindowUnderflow4:
	l32e	a0, a5, -16
	l32e	a1, a5, -12
	l32e	a2, a5,  -8
	l32e	a3, a5,  -4
	rfwu

/*
 * Fast handler for the alloca exception, reached from ExceptionHandler.
 * It shares the 64-byte slot of _WindowUnderflow4; the numbers in the
 * left-hand comments are byte offsets from _WindowUnderflow4.
 *
 * State on entry:
 * a0:	a0
 * a1:	new stack pointer = a1 - 16 - 4
 * a2:	available, saved in excsave1
 * a3:	available, saved on stack *a1
 *
 * Steps:
 *  - replace the OWB field of PS with the current WINDOWBASE (the field is
 *    extracted, XORed with WINDOWBASE, shifted back into position and XORed
 *    into PS),
 *  - restore a3 from the stack, drop the small stack frame and restore a2
 *    from EXCSAVE1,
 *  - rotate the window back and continue in the underflow handler selected
 *    by the two top bits of a4 (after the first rotw) and a8 (after the
 *    second); the 12-register case enters at __WindowUnderflow12, which
 *    performs the third rotation.
 *
 * NOTE(review): the extui below passes PS_OWB_SHIFT as the field width as
 * well as the shift amount; a width constant for the OWB field would be
 * expected here - confirm against asm/regs.h and the PS register layout.
 */

/* 15*/	.byte	0xff		/* pad byte so the code below starts at 16 */

fast_alloca_exception:	/* must be at _WindowUnderflow4 + 16 */

/* 16*/	rsr	a2, PS
/* 19*/	rsr	a3, WINDOWBASE
/* 22*/	extui	a2, a2, PS_OWB_SHIFT, PS_OWB_SHIFT
/* 25*/	xor	a2, a2, a3
/* 28*/	rsr	a3, PS
/* 31*/	slli	a2, a2, PS_OWB_SHIFT
/* 34*/	xor	a2, a3, a2
/* 37*/	wsr	a2, PS

/* 40*/	_l32i	a3, a1, 0
/* 43*/	addi	a1, a1, 16 + 4
/* 46*/	rsr	a2, EXCSAVE1

/* 49*/	rotw	-1
/* 52*/	_bbci.l	a4, 31, _WindowUnderflow4	/* 0x: call4 */
/* 55*/	rotw	-1
/* 58*/	_bbci.l	a8, 30, _WindowUnderflow8	/* 10: call8 */
/* 61*/ _j	__WindowUnderflow12		/* 11: call12 */
/* 64*/

/*
 * 8-Register Window Overflow Vector (Handler)
 *
 * a0..a3 are stored into the 16 bytes directly below the address in a9.
 * Once a0 has been stored it is reused as a pointer, loaded from the word
 * at a1 - 12, and a4..a7 are stored at offsets -32..-20 from that pointer.
 */

	.align 64
.global _WindowOverflow8
_WindowOverflow8:
	s32e	a0, a9, -16
	l32e	a0, a1, -12
	s32e	a2, a9,  -8
	s32e	a1, a9, -12
	s32e	a3, a9,  -4
	s32e	a4, a0, -32
	s32e	a5, a0, -28
	s32e	a6, a0, -24
	s32e	a7, a0, -20
	rfwo

/*
 * 8-Register Window Underflow Vector (Handler)
 *
 * a0..a3 are reloaded from the 16 bytes directly below the address in a9.
 * a7 is used as a pointer, loaded from the word at (restored) a1 - 12, to
 * reload a4..a6 from offsets -32..-24; a7 itself is reloaded last from
 * offset -20.
 */

	.align 64
.global _WindowUnderflow8
_WindowUnderflow8:
	l32e	a1, a9, -12
	l32e	a0, a9, -16
	l32e	a7, a1, -12
	l32e	a2, a9,  -8
	l32e	a4, a7, -32
	l32e	a3, a9,  -4
	l32e	a5, a7, -28
	l32e	a6, a7, -24
	l32e	a7, a7, -20
	rfwu

/*
 * 12-Register Window Overflow Vector (Handler)
 *
 * a0..a3 are stored into the 16 bytes directly below the address in a13.
 * Once a0 has been stored it is reused as a pointer, loaded from the word
 * at a1 - 12, and a4..a11 are stored at offsets -48..-20 from that pointer.
 */

	.align 64
.global _WindowOverflow12
_WindowOverflow12:
	s32e	a0,  a13, -16
	l32e	a0,  a1,  -12
	s32e	a1,  a13, -12
	s32e	a2,  a13,  -8
	s32e	a3,  a13,  -4
	s32e	a4,  a0,  -48
	s32e	a5,  a0,  -44
	s32e	a6,  a0,  -40
	s32e	a7,  a0,  -36
	s32e	a8,  a0,  -32
	s32e	a9,  a0,  -28
	s32e	a10, a0,  -24
	s32e	a11, a0,  -20
	rfwo

/*
 * 12-Register Window Underflow Vector (Handler)
 *
 * The .org places a single "rotw -1" in the last three bytes before the
 * vector, which therefore starts 64 bytes after _WindowOverflow12.
 * __WindowUnderflow12 (with the extra rotation) is the entry point used by
 * fast_alloca_exception; _WindowUnderflow12 is the vector proper.
 *
 * a0..a3 are reloaded from the 16 bytes directly below the address in a13.
 * a11 is used as a pointer, loaded from the word at (restored) a1 - 12, to
 * reload a4..a10 from offsets -48..-24; a11 itself is reloaded last from
 * offset -20.
 */

	.org _WindowOverflow12 + 64 - 3
__WindowUnderflow12:
	rotw	-1
.global _WindowUnderflow12
_WindowUnderflow12:
	l32e	a1,  a13, -12
	l32e	a0,  a13, -16
	l32e	a11, a1,  -12
	l32e	a2,  a13,  -8
	l32e	a4,  a11, -48
	l32e	a8,  a11, -32
	l32e	a3,  a13,  -4
	l32e	a5,  a11, -44
	l32e	a6,  a11, -40
	l32e	a7,  a11, -36
	l32e	a9,  a11, -28
	l32e	a10, a11, -24
	l32e	a11, a11, -20
	rfwu

#endif /* XCHAL_HAVE_WINDOWED */