/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * JNI method invocation.  This is used to call a C/C++ JNI method.  The
 * argument list has to be pushed onto the native stack according to
 * local calling conventions.
 *
 * This version supports the "new" ARM EABI.
 */

#include <machine/cpu-features.h>

#ifdef __ARM_EABI__

#ifdef EXTENDED_EABI_DEBUG
# define DBG
#else
# define DBG @
#endif


/*
Function prototype:

void dvmPlatformInvoke(void* pEnv, ClassObject* clazz, int argInfo, int argc,
    const u4* argv, const char* signature, void* func, JValue* pReturn)

The method we are calling has the form:

  return_type func(JNIEnv* pEnv, ClassObject* clazz, ...)
    -or-
  return_type func(JNIEnv* pEnv, Object* this, ...)

We receive a collection of 32-bit values which correspond to arguments from
the interpreter (e.g. float occupies one, double occupies two).  It's up to
us to convert these into local calling conventions.
*/

/*
ARM EABI notes:

r0-r3 hold first 4 args to a method
r9 is given special treatment in some situations, but not for us
r10 (sl) seems to be generally available
r11 (fp) is used by gcc (unless -fomit-frame-pointer is set)
r12 (ip) is scratch -- not preserved across method calls
r13 (sp) should be managed carefully in case a signal arrives
r14 (lr) must be preserved
r15 (pc) can be tinkered with directly

r0 holds returns of <= 4 bytes
r0-r1 hold returns of 8 bytes, low word in r0

Callee must save/restore r4+ (except r12) if it modifies them.

Stack is "full descending".  Only the arguments that don't fit in the first 4
registers are placed on the stack.  "sp" points at the first stacked argument
(i.e. the 5th arg).

VFP: single-precision results in s0, double-precision results in d0.

In the EABI, "sp" must be 64-bit aligned on entry to a function, and any
64-bit quantities (long long, double) must be 64-bit aligned.  This means
we have to scan the method signature, identify arguments that must be
padded, and fix them up appropriately.
*/

    .text
    .align  2
    .global dvmPlatformInvoke
    .type   dvmPlatformInvoke, %function

/*
 * On entry:
 *   r0  JNIEnv (can be left alone)
 *   r1  clazz (NULL for virtual method calls, non-NULL for static)
 *   r2  arg info
 *   r3  argc (number of 32-bit values in argv)
 *   [sp]     argv
 *   [sp,#4]  short signature
 *   [sp,#8]  func
 *   [sp,#12] pReturn
 *
 * For a virtual method call, the "this" reference is in argv[0].
 *
 * argInfo (32-bit int) layout:
 *   SRRRLLLL FFFFFFFF FFFFFFFF FFFFFFFF
 *
 *   S - if set, do things the hard way (scan the signature)
 *   R - return-type enumeration, really only important for "hard" FP ABI
 *   L - number of double-words of storage required on stack (0-15
 *       double-words, i.e. 0-30 words)
 *   F - pad flag -- if set, write a pad word to the stack
 *
 * With this arrangement we can efficiently push up to 24 words of arguments
 * onto the stack.  Anything requiring more than that -- which should happen
 * rarely to never -- can do the slow signature scan.
 *
 * (We could pack the Fs more efficiently -- we know we never push two pads
 * in a row, and the first word can never be a pad -- but there's really
 * no need for it.)
 *
 * NOTE: if the called function has more than 4 words of arguments, gdb
 * will not be able to unwind the stack past this method.  The only way
 * around this is to convince gdb to respect an explicit frame pointer.
 * The stack unwinder in debuggerd *does* pay attention to fp if we set it
 * up appropriately, so at least that will work.
 */
dvmPlatformInvoke:
    .fnstart

    /*
     * Overview: save the callee-saved registers we use, pick up "this"
     * from argv for non-static calls, lay out the outgoing stack arguments
     * (fast path driven by argInfo, slow path by scanning the signature),
     * load r2/r3 from argv, call "func", and store r0/r1 into *pReturn
     * unless the return type is void.
     *
     * Save regs.
     *
     * On entry to a function, "sp" must be 64-bit aligned.  This means
     * we have to adjust sp manually if we push an odd number of regs here
     * (both here and when exiting).
     *
     * The ARM spec doesn't specify anything about the frame pointer.  gcc
     * points fp at the first saved register, so our "full descending"
     * stack looks like:
     *
     *  pReturn
     *  func
     *  shorty
     *  argv        <-- sp on entry
     *  lr          <-- fp
     *  fp
     *  r9...r7
     *  r6          <-- sp after reg save
     *
     * Any arguments that need to be pushed on for the target method
     * come after this.  The last argument is pushed first.
     */
SAVED_REG_COUNT = 6                     @ push 6 regs
FP_STACK_OFFSET = (SAVED_REG_COUNT-1) * 4 @ offset between fp and post-save sp
FP_ADJ = 4                              @ fp is initial sp +4

    .save        {r6, r7, r8, r9, fp, lr}
    stmfd   sp!, {r6, r7, r8, r9, fp, lr}

    .setfp  fp, sp, #FP_STACK_OFFSET    @ point fp at first saved reg
    add     fp, sp, #FP_STACK_OFFSET

    @.pad    #4                          @ adjust for 64-bit align
    @sub     sp, sp, #4                  @ (if we save odd number of regs)

    @ Ensure 64-bit alignment.  EABI guarantees sp is aligned on entry, make
    @ sure we're aligned properly now.
DBG tst     sp, #4                      @ 64-bit aligned?
DBG bne     dvmAbort                    @ no, fail

    ldr     r9, [fp, #0+FP_ADJ]         @ r9<- argv
    cmp     r1, #0                      @ calling a static method?

    @ Not static, grab the "this" pointer.  Note "this" is not explicitly
    @ described by the method signature.
    subeq   r3, r3, #1                  @ argc--
    ldreq   r1, [r9], #4                @ r1<- *argv++

    @ Do we have arg padding flags in "argInfo"? (just need to check hi bit)
    teq     r2, #0
    bmi     .Lno_arg_info

    /*
     * "Fast" path.
     *
     * Make room on the stack for the arguments and copy them over,
     * inserting pad words when appropriate.
     *
     * Currently:
     *  r0  don't touch
     *  r1  don't touch
     *  r2  arg info
     *  r3  argc
     *  r4-r5  don't touch (not saved)
     *  r6-r8 (available)
     *  r9  argv
     *  fp  frame pointer
     */
.Lhave_arg_info:
    @ Expand the stack by the specified amount.  We want to extract the
    @ count of double-words from r2, multiply it by 8, and subtract that
    @ from the stack pointer.
    and     ip, r2, #0x0f000000         @ ip<- double-words required
    mov     r6, r2, lsr #28             @ r6<- return type (S bit is clear)
    sub     sp, sp, ip, lsr #21         @ shift right 24, then left 3
    mov     r8, sp                      @ r8<- sp  (arg copy dest)

    @ Stick argv in r7 and advance it past the argv values that will be
    @ held in r2-r3.  It's possible r3 will hold a pad, so check the
    @ bit in r2.  We do this by ignoring the first bit (which would
    @ indicate a pad in r2) and shifting the second into the carry flag.
    @ If the carry is set, r3 will hold a pad, so we adjust argv less.
    @
    @ (This is harmless if argc==0)
    mov     r7, r9
    movs    r2, r2, lsr #2
    addcc   r7, r7, #8                  @ skip past 2 words, for r2 and r3
    subcc   r3, r3, #2
    addcs   r7, r7, #4                  @ skip past 1 word, for r2
    subcs   r3, r3, #1

.Lfast_copy_loop:
    @ if (--argc < 0) goto .Lcopy_done
    subs    r3, r3, #1
    bmi     .Lcopy_done                 @ NOTE: expects original argv in r9

.Lfast_copy_loop2:
    @ Get pad flag into carry bit.  If it's set, we don't pull a value
    @ out of argv.
    movs    r2, r2, lsr #1
    ldrcc   ip, [r7], #4                @ ip = *r7++ (pull from argv)
    strcc   ip, [r8], #4                @ *r8++ = ip (write to stack)
    bcc     .Lfast_copy_loop

DBG movcs   ip, #-3                     @ DEBUG DEBUG - make pad word obvious
DBG strcs   ip, [r8]                    @ DEBUG DEBUG
    add     r8, r8, #4                  @ if pad, just advance r8 without store
    b       .Lfast_copy_loop2           @ don't adjust argc after writing pad


.Lcopy_done:
    /*
     * Reached from both the fast path and the slow path.
     *
     * Currently:
     *  r0-r3  args (JNIEnv*, thisOrClass, arg0, arg1)
     *  r6  return type (enum DalvikJniReturnType), zero means void
     *  r9  original argv
     *  fp  frame pointer
     *
     * The stack copy is complete.  Grab the first two words off of argv
     * and tuck them into r2/r3.  If the first arg is 32-bit and the second
     * arg is 64-bit, then r3 "holds" a pad word and the load is unnecessary
     * but harmless.
     *
     * If there are 0 or 1 arg words in argv, we will be loading uninitialized
     * data into the registers, but since nothing tries to use it it's also
     * harmless (assuming argv[0] and argv[1] point to valid memory, which
     * is a reasonable assumption for Dalvik's interpreted stacks).
     */
    ldmia   r9, {r2-r3}                 @ r2/r3<- argv[0]/argv[1]

    ldr     ip, [fp, #8+FP_ADJ]         @ ip<- func
#ifdef __ARM_HAVE_BLX
    blx     ip                          @ call func
#else
    mov     lr, pc                      @ call func the old-fashioned way
    bx      ip
#endif

    @ We're back, result is in r0 or (for long/double) r0-r1.
    @
    @ In theory, we need to use the "return type" arg to figure out what
    @ we have and how to return it.  However, unless we have an FPU and
    @ "hard" fp calling conventions, all we need to do is copy r0-r1 into
    @ the JValue union.
    @
    @ Thought: could redefine DalvikJniReturnType such that single-word
    @ and double-word values occupy different ranges; simple comparison
    @ allows us to choose between str and stm.  Probably not worthwhile.
    @
    cmp     r6, #0                      @ DALVIK_JNI_RETURN_VOID?
    ldrne   ip, [fp, #12+FP_ADJ]        @ pReturn
    sub     sp, fp, #FP_STACK_OFFSET    @ restore sp to post-reg-save offset
    stmneia ip, {r0-r1}                 @ pReturn->j <- r0/r1

    @ Restore the registers we saved and return.  On >= ARMv5TE we can
    @ restore PC directly from the saved LR.
#ifdef __ARM_HAVE_PC_INTERWORK
    ldmfd   sp!, {r6, r7, r8, r9, fp, pc}
#else
    ldmfd   sp!, {r6, r7, r8, r9, fp, lr}
    bx      lr
#endif



    /*
     * "Slow" path.
     * Walk through the argument list, counting up the number of 32-bit words
     * required to contain it.  Then walk through it a second time, copying
     * values out to the stack.  (We could pre-compute the size to save
     * ourselves a trip, but we'd have to store that somewhere -- this is
     * sufficiently unlikely that it's not worthwhile.)
     *
     * Try not to make any assumptions about the number of args -- I think
     * the class file format allows up to 64K words (need to verify that).
     *
     * Currently:
     *  r0  don't touch
     *  r1  don't touch
     *  r2  arg info (only the return type is still needed)
     *  r3  argc (not needed on this path; reused as the signature cursor,
     *      and reloaded from argv at .Lcopy_done)
     *  r4-r5 don't touch (not saved)
     *  r6-r8 (available)
     *  r9  argv
     *  fp  frame pointer
     *
     * r6 must hold the return type when we reach .Lcopy_done, so it is set
     * up first and left alone for the rest of this path.
     */
.Lno_arg_info:
    mov     r6, r2, lsr #28             @ r6<- S bit + return type
    and     r6, r6, #7                  @ r6<- return type (drop the S bit)
    ldr     r3, [fp, #4+FP_ADJ]         @ r3<- short signature
    add     r3, r3, #1                  @ advance past return type
    mov     r2, #0                      @ r2<- word count, init to zero

.Lcount_loop:
    ldrb    ip, [r3], #1                @ ip<- *signature++
    cmp     ip, #0                      @ end?
    beq     .Lcount_done                @ all done, bail
    add     r2, r2, #1                  @ count++
    cmp     ip, #'D'                    @ look for 'D' or 'J', which are 64-bit
    cmpne   ip, #'J'
    bne     .Lcount_loop

    @ 64-bit value, insert padding if we're not aligned.  r2 already
    @ counts the first word of this value, so an odd r2 means the value
    @ starts at an even (aligned) word index.
    tst     r2, #1                      @ odd after initial incr?
    addne   r2, #1                      @ odd: aligned, add the second word
    addeq   r2, #2                      @ even: counted word becomes the pad,
                                        @  add both words of the value
    b       .Lcount_loop
.Lcount_done:

    @ We have the padded-out word count in r2.  We subtract 2 from it
    @ because we don't push the first two arg words on the stack (they're
    @ destined for r2/r3).  Pushing them on and popping them off would be
    @ simpler but slower.
    subs    r2, r2, #2                  @ subtract 2 (for contents of r2/r3)
    movmis  r2, #0                      @ if negative, peg at zero, set Z-flag
    beq     .Lcopy_done                 @ zero args, skip stack copy

DBG tst     sp, #7                      @ DEBUG - make sure sp is aligned now
DBG bne     dvmAbort                    @ DEBUG

    @ Set up to copy from r7 to r8.  We copy from the second arg to the
    @ last arg, which means reading and writing to ascending addresses.
    sub     sp, sp, r2, asl #2          @ sp<- sp - r2*4
    bic     sp, #4                      @ subtract another 4 if needed to
                                        @  keep sp 64-bit aligned
    mov     r7, r9                      @ r7<- argv
    mov     r8, sp                      @ r8<- sp

    @ We need to copy words from [r7] to [r8].  We walk forward through
    @ the signature again, "copying" pad words when appropriate, storing
    @ upward into the stack.
    ldr     r3, [fp, #4+FP_ADJ]         @ r3<- signature
    add     r3, r3, #1                  @ advance past return type
    add     r7, r7, #8                  @ r7<- r7+8 (assume argv 0/1 in r2/r3)

    @ Eat first arg or two, for the stuff that goes into r2/r3.
    ldrb    ip, [r3], #1                @ ip<- *signature++
    cmp     ip, #'D'
    cmpne   ip, #'J'
    beq     .Lstack_copy_loop           @ 64-bit arg fills r2+r3

    @ First arg was 32-bit, check the next
    ldrb    ip, [r3], #1                @ ip<- *signature++
    cmp     ip, #'D'
    cmpne   ip, #'J'
    subeq   r7, #4                      @ r7<- r7-4 (take it back - pad word)
    beq     .Lstack_copy_loop2          @ start with char we already have

    @ Two 32-bit args, fall through and start with next arg

.Lstack_copy_loop:
    ldrb    ip, [r3], #1                @ ip<- *signature++
.Lstack_copy_loop2:
    cmp     ip, #0                      @ end of shorty?
    beq     .Lcopy_done                 @ yes

    cmp     ip, #'D'
    cmpne   ip, #'J'
    beq     .Lcopy64

    @ Copy a 32-bit value.  [r8] starts at the lowest address of the
    @ outgoing argument area (the new sp), so we store into [r8] and incr
    @ as we move toward the end of the arg list.
.Lcopy32:
    ldr     ip, [r7], #4
    str     ip, [r8], #4
    b       .Lstack_copy_loop

.Lcopy64:
    @ Copy a 64-bit value.  If necessary, leave a hole in the stack to
    @ ensure alignment.  We know the [r8] output area is 64-bit aligned,
    @ so we can just mask the address.  r2 is free to use as scratch here;
    @ it gets reloaded from argv at .Lcopy_done.
    add     r8, r8, #7          @ r8<- (r8+7) & ~7
    ldr     ip, [r7], #4
    bic     r8, r8, #7
    ldr     r2, [r7], #4
    str     ip, [r8], #4
    str     r2, [r8], #4
    b       .Lstack_copy_loop

    .fnend
    .size   dvmPlatformInvoke, .-dvmPlatformInvoke

#if 0

/*
 * Debug aid.  Each SQUEAK invocation emits a "common_squeak<N>" routine
 * that calls printf with the "<%d>" format string and its own number,
 * pushing r0-r3, ip and lr beforehand and popping them afterwards.
 * (The attempt to save ip won't work, but we need to save an even number
 * of registers for EABI 64-bit stack alignment.)
 */
     .macro SQUEAK num
common_squeak\num:
    stmdb   sp!, {r0-r3, ip, lr}        @ push 6 regs (stmdb == stmfd)
    mov     r1, #\num
    ldr     r0, strSqueak               @ r0<- address of format string
    bl      printf
#ifdef __ARM_HAVE_PC_INTERWORK
    ldmia   sp!, {r0-r3, ip, pc}        @ pop and return (ldmia == ldmfd)
#else
    ldmia   sp!, {r0-r3, ip, lr}
    bx      lr
#endif
    .endm

    @ Instantiate common_squeak0 through common_squeak5.
    SQUEAK  0
    SQUEAK  1
    SQUEAK  2
    SQUEAK  3
    SQUEAK  4
    SQUEAK  5

    @ Literal word holding the address of the format string, loaded
    @ PC-relative by the routines above.
strSqueak:
    .word   .LstrSqueak
.LstrSqueak:
    .asciz  "<%d>"

    .align  2

#endif

#endif /*__ARM_EABI__*/