/*--------------------------------------------------------------------*/
/*--- The core dispatch loop, for jumping to a code address.       ---*/
/*---                                      dispatch-amd64-darwin.S ---*/
/*--------------------------------------------------------------------*/

/*
  This file is part of Valgrind, a dynamic binary instrumentation
  framework.

  Copyright (C) 2000-2011 Julian Seward 
     jseward@acm.org

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  02111-1307, USA.

  The GNU General Public License is contained in the file COPYING.
*/

#if defined(VGP_amd64_darwin)

#include "pub_core_basics_asm.h"
#include "pub_core_dispatch_asm.h"
#include "pub_core_transtab_asm.h"
#include "libvex_guest_offsets.h"	/* for OFFSET_amd64_RIP */


/*------------------------------------------------------------*/
/*---                                                      ---*/
/*--- The dispatch loop.  VG_(run_innerloop) is used to    ---*/
/*--- run all translations except no-redir ones.           ---*/
/*---                                                      ---*/
/*------------------------------------------------------------*/

/*----------------------------------------------------*/
/*--- Preamble (set everything up)                 ---*/
/*----------------------------------------------------*/

/* signature:
UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
*/
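
/* The return value (in %rax) is a request code: one of the VG_TRC_*
   values set at the exit points below, or, if generated code changed
   the guest state pointer, the changed value itself (see gsp_changed).
   A caller-side sketch (hedged; purely illustrative, not the real call
   site in the scheduler):

      UWord trc = VG_(run_innerloop)( guest_state, do_profiling );
      if (trc == VG_TRC_INNER_FASTMISS) {
         // no translation found for the next guest address
      } else if (trc == VG_TRC_INNER_COUNTERZERO) {
         // timeslice expired; reschedule
      }
*/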

.text
.globl VG_(run_innerloop)
VG_(run_innerloop):
	/* %rdi holds guest_state */
	/* %rsi holds do_profiling */
	
	/* ----- entry point to VG_(run_innerloop) ----- */
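	/* Save integer registers; they are restored, in reverse order,
	   at run_innerloop_exit_REALLY below. */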
	pushq	%rbx
	pushq	%rcx
	pushq	%rdx
	pushq	%rsi
	pushq	%rbp
	pushq	%r8
	pushq	%r9
	pushq	%r10
	pushq	%r11
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdi  /* guest_state */

	movq	VG_(dispatch_ctr)@GOTPCREL(%rip), %r15
	movl	(%r15), %r15d
	pushq	%r15

	/* 8(%rsp) holds cached copy of guest_state ptr */
	/* 0(%rsp) holds cached copy of VG_(dispatch_ctr) */

	/* Set up the guest state pointer */
	movq	%rdi, %rbp
	
	/* fetch %RIP into %rax */
	movq	OFFSET_amd64_RIP(%rbp), %rax

	/* set host FPU control word to the default mode expected 
           by VEX-generated code.  See comments in libvex.h for
           more info. */
	finit
	pushq	$0x027F
	fldcw	(%rsp)
	addq	$8, %rsp
	
	/* set host SSE control word to the default mode expected 
	   by VEX-generated code. */
	pushq	$0x1F80
	ldmxcsr	(%rsp)
	addq	$8, %rsp
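	/* (0x027F selects 53-bit precision, round-to-nearest, all x87
	   exceptions masked; 0x1F80 is the architectural default MXCSR:
	   round-to-nearest, all SSE exceptions masked.  These must match
	   the values re-checked at run_innerloop_exit below.) */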

	/* set dir flag to known value */
	cld

	/* fall into the appropriate main loop, as selected by do_profiling */
	cmpq	$0, %rsi
	je	VG_(run_innerloop__dispatch_unassisted_unprofiled)
	jmp	VG_(run_innerloop__dispatch_unassisted_profiled)
	/*NOTREACHED*/	

/*----------------------------------------------------*/
/*--- NO-PROFILING (standard) dispatcher           ---*/
/*----------------------------------------------------*/

.align	4
.globl	VG_(run_innerloop__dispatch_unassisted_unprofiled)
VG_(run_innerloop__dispatch_unassisted_unprofiled):
	/* AT ENTRY: %rax is next guest addr, %rbp is the
           unmodified guest state ptr */

	/* save the jump address in the guest state */
	movq	%rax, OFFSET_amd64_RIP(%rbp)

	/* Are we out of timeslice?  If yes, defer to scheduler. */
	subl	$1, 0(%rsp)
	jz	counter_is_zero

	/* try a fast lookup in the translation cache */
	movabsq	$VG_(tt_fast), %rcx
	movq	%rax, %rbx
	andq	$VG_TT_FAST_MASK, %rbx	/* entry# */
	shlq	$4, %rbx		/* entry# * sizeof(FastCacheEntry) */
	movq	0(%rcx,%rbx,1), %r10	/* .guest */
	movq	8(%rcx,%rbx,1), %r11	/* .host */
	cmpq	%rax, %r10
	jnz	fast_lookup_failed
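
	/* In C terms the lookup above is roughly the following (a sketch
	   only, assuming FastCacheEntry is the 16-byte { guest, host }
	   pair that the offsets and the shift-by-4 imply):

	      UWord           entry = rax & VG_TT_FAST_MASK;
	      FastCacheEntry* fce   = &VG_(tt_fast)[entry];
	      if (fce->guest != rax) goto fast_lookup_failed;
	      // else jump to fce->host
	*/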

        /* Found a match.  Jump to .host. */
	jmp 	*%r11
	ud2	/* persuade insn decoders not to speculate past here */
	/* generated code should run, then jump back to
	   VG_(run_innerloop__dispatch_{un,}assisted_unprofiled). */
	/*NOTREACHED*/

.align	4
.globl	VG_(run_innerloop__dispatch_assisted_unprofiled)
VG_(run_innerloop__dispatch_assisted_unprofiled):
	/* AT ENTRY: %rax is next guest addr, %rbp is the
           modified guest state ptr.  Since the GSP has changed,
           jump directly to gsp_changed. */
        jmp     gsp_changed
        ud2
        /*NOTREACHED*/

/*----------------------------------------------------*/
/*--- PROFILING dispatcher (can be much slower)    ---*/
/*----------------------------------------------------*/

.align	4
.globl	VG_(run_innerloop__dispatch_unassisted_profiled)
VG_(run_innerloop__dispatch_unassisted_profiled):
	/* AT ENTRY: %rax is next guest addr, %rbp is the
           unmodified guest state ptr */

	/* save the jump address in the guest state */
	movq	%rax, OFFSET_amd64_RIP(%rbp)

	/* Are we out of timeslice?  If yes, defer to scheduler. */
	subl	$1, 0(%rsp)
	jz	counter_is_zero

	/* try a fast lookup in the translation cache */
	movabsq	$VG_(tt_fast), %rcx
	movq	%rax, %rbx
	andq	$VG_TT_FAST_MASK, %rbx	/* entry# */
	shlq	$4, %rbx		/* entry# * sizeof(FastCacheEntry) */
	movq	0(%rcx,%rbx,1), %r10	/* .guest */
	movq	8(%rcx,%rbx,1), %r11	/* .host */
	cmpq	%rax, %r10
	jnz	fast_lookup_failed

	/* increment bb profile counter */
	movabsq	$VG_(tt_fastN), %rdx
	shrq	$1, %rbx		/* entry# * sizeof(UInt*) */
	movq	(%rdx,%rbx,1), %rdx
	addl	$1, (%rdx)
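
	/* Profile-counter sketch (hedged): VG_(tt_fastN) appears to be a
	   parallel array of pointers to 32-bit counters, one per tt_fast
	   entry.  %rbx held entry#*16 (the tt_fast byte offset); shifting
	   right by 1 gives entry#*8, the byte offset into an array of
	   8-byte pointers.  Roughly:

	      UInt* counter = VG_(tt_fastN)[entry];
	      (*counter)++;
	*/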

        /* Found a match.  Jump to .host. */
	jmp 	*%r11
	ud2	/* persuade insn decoders not to speculate past here */
	/* generated code should run, then jump back to
	   VG_(run_innerloop__dispatch_{un,}assisted_profiled). */
	/*NOTREACHED*/

.align	4
.globl	VG_(run_innerloop__dispatch_assisted_profiled)
VG_(run_innerloop__dispatch_assisted_profiled):
	/* AT ENTRY: %rax is next guest addr, %rbp is the
           modified guest state ptr.  Since the GSP has changed,
           jump directly to gsp_changed. */
        jmp     gsp_changed
        ud2
        /*NOTREACHED*/

/*----------------------------------------------------*/
/*--- exit points                                  ---*/
/*----------------------------------------------------*/

gsp_changed:
	/* Someone messed with the gsp.  Have to
	   defer to the scheduler to resolve this.  The dispatch ctr
	   has not been decremented for this dispatch, so there is
	   nothing to back out (no need to increment). */
	/* %RIP is NOT up to date here.  First, need to write
	   %rax back to %RIP, but without trashing %rbp since
	   that holds the value we want to return to the scheduler.
	   Hence use %r15 transiently for the guest state pointer. */
	movq	8(%rsp), %r15
	movq	%rax, OFFSET_amd64_RIP(%r15)
	movq	%rbp, %rax
	jmp	run_innerloop_exit
	/*NOTREACHED*/

counter_is_zero:
	/* %RIP is up to date here */
	/* back out decrement of the dispatch counter */
	addl	$1, 0(%rsp)
	movq	$VG_TRC_INNER_COUNTERZERO, %rax
	jmp	run_innerloop_exit

fast_lookup_failed:
	/* %RIP is up to date here */
	/* back out decrement of the dispatch counter */
	addl	$1, 0(%rsp)
	movq	$VG_TRC_INNER_FASTMISS, %rax
	jmp	run_innerloop_exit



/* All exits from the dispatcher go through here.  %rax holds
   the return value. 
*/
run_innerloop_exit: 
	/* We're leaving.  Check that nobody messed with
	   %mxcsr or %fpucw.  We can't mess with %rax here as it
	   holds the tentative return value, but any other register is OK. */
#if !defined(ENABLE_INNER)
        /* This check fails for self-hosting, so skip in that case */
	pushq	$0
	fstcw	(%rsp)
	cmpl	$0x027F, (%rsp)
	popq	%r15 /* get rid of the word without trashing %eflags */
	jnz	invariant_violation
#endif
	pushq	$0
	stmxcsr	(%rsp)
	andl	$0xFFFFFFC0, (%rsp)  /* mask out status flags */
	cmpl	$0x1F80, (%rsp)
	popq	%r15
	jnz	invariant_violation
	/* otherwise we're OK */
	jmp	run_innerloop_exit_REALLY

invariant_violation:
	movq	$VG_TRC_INVARIANT_FAILED, %rax
	jmp	run_innerloop_exit_REALLY

run_innerloop_exit_REALLY:

	/* restore VG_(dispatch_ctr) */	
	popq	%r14
	movq	VG_(dispatch_ctr)@GOTPCREL(%rip), %r15
	movl	%r14d, (%r15)

	popq	%rdi
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%r11
	popq	%r10
	popq	%r9
	popq	%r8
	popq	%rbp
	popq	%rsi
	popq	%rdx
	popq	%rcx
	popq	%rbx
	ret	

	
/*------------------------------------------------------------*/
/*---                                                      ---*/
/*--- A special dispatcher, for running no-redir           ---*/
/*--- translations.  Just runs the given translation once. ---*/
/*---                                                      ---*/
/*------------------------------------------------------------*/

/* signature:
void VG_(run_a_noredir_translation) ( UWord* argblock );
*/

/* Run a no-redir translation.  argblock points to 4 UWords, 2 to carry args
   and 2 to carry results:
      0: input:  ptr to translation
      1: input:  ptr to guest state
      2: output: next guest PC
      3: output: guest state pointer afterwards (== thread return code)
*/
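
/* A caller-side sketch (hedged; the identifiers are illustrative only,
   not the real call site):

      UWord argblock[4];
      argblock[0] = (UWord)host_code;     // translation to run
      argblock[1] = (UWord)&guest_state;  // guest state to run it on
      VG_(run_a_noredir_translation)( argblock );
      // argblock[2] now holds the next guest PC (written from %rax)
      // argblock[3] now holds the guest state ptr afterwards (from %rbp)
*/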
.align 4
.globl VG_(run_a_noredir_translation)
VG_(run_a_noredir_translation):
	/* Save callee-saves regs */
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15

	pushq %rdi  /* we will need it after running the translation */
	movq 8(%rdi), %rbp   /* argblock[1]: guest state ptr */
	jmp *0(%rdi)         /* argblock[0]: jump to the translation */
	/*NOTREACHED*/
	ud2
	/* If the translation has been correctly constructed, we
	   should resume at the following label. */
.globl VG_(run_a_noredir_translation__return_point)
VG_(run_a_noredir_translation__return_point):
	popq %rdi                 /* recover the argblock ptr */
	movq %rax, 16(%rdi)       /* argblock[2]: next guest PC */
	movq %rbp, 24(%rdi)       /* argblock[3]: guest state ptr afterwards */

	popq  %r15
	popq  %r14
	popq  %r13
	popq  %r12
	popq  %rbp
	popq  %rbx
	ret

#endif // defined(VGP_amd64_darwin)

/*--------------------------------------------------------------------*/
/*--- end                                                          ---*/
/*--------------------------------------------------------------------*/