///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///**
//*******************************************************************************
//*
//* //brief
//*     interprediction luma function for copy
//*
//* //par description:
//*   copies the array of width 'wd' and height 'ht' from the location pointed
//*   to by 'pu1_src' to the location pointed to by 'pu1_dst'
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients (not read by this copy routine)
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_luma_copy (
//                            uword8 *pu1_src,
//                            uword8 *pu1_dst,
//                            word32 src_strd,
//                            word32 dst_strd,
//                            word8 *pi1_coeff,
//                            word32 ht,
//                            word32 wd   )

//**************variables vs registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x11 =>  ht
//    x16 => wd

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_copy_av8

.type ihevc_inter_pred_luma_copy_av8, %function

//-----------------------------------------------------------------------------
// void ihevc_inter_pred_luma_copy_av8(uword8 *pu1_src, uword8 *pu1_dst,
//                                     word32 src_strd, word32 dst_strd,
//                                     word8 *pi1_coeff, word32 ht, word32 wd)
//
// In:    x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
//        x4 = pi1_coeff (never read), w5 = ht, w6 = wd
// Out:   nothing (x0 is left holding an advanced source pointer)
// Clobb: x0-x3, x8-x11, x15, x16, v0-v3, condition flags
// Stack: none (leaf routine, no callee-saved register is written)
//
// The block is copied in tiles that are four rows tall. The tile width is
// 16 bytes when wd is a multiple of 16, 8 bytes when wd is a multiple of 8,
// and 4 bytes otherwise. Consequently ht is effectively rounded up to a
// multiple of 4 and, on the 4-byte path, wd is rounded up to a multiple of 4;
// callers are presumably expected to pass such sizes - TODO confirm.
// Nothing is read or written when ht <= 0 or wd <= 0.
//-----------------------------------------------------------------------------
ihevc_inter_pred_luma_copy_av8:
    // The four integer arguments are 32-bit; AAPCS64 leaves bits [63:32] of
    // their registers unspecified, so widen them before any 64-bit use.
    sxtw        x2,w2                       //src_strd (used as a 64-bit post-index)
    sxtw        x3,w3                       //dst_strd (used as a 64-bit post-index)
    sxtw        x11,w5                      //x11 = ht
    sxtw        x16,w6                      //x16 = wd
    cmp         x11,#0                      //nothing to do when ht <= 0
    ble         end_loops
    cmp         x16,#0                      //nothing to do when wd <= 0
    ble         end_loops
    tst         x16,#15                     //wd multiple of 16 -> 16-byte columns
    beq         core_loop_wd_16
    tst         x16,#7                      //wd multiple of 8 -> 8-byte columns
    beq         core_loop_wd_8
    sub         x15,x16,#4                  //x15 = wd - 4, column rewind for the next row group

outer_loop_wd_4:
    mov         x8,x16                      //x8 = bytes left in this row group (> 0 here)

inner_loop_wd_4:
    ld1         {v0.s}[0],[x0]              //row 0: load 4 bytes
    add         x9,x0,x2                    //x9 = src pointer for row 1 of this column
    add         x10,x1,x3                   //x10 = dst pointer for row 1 of this column
    st1         {v0.s}[0],[x1]              //row 0: store 4 bytes
    ld1         {v0.s}[0],[x9],x2           //row 1: load, step to row 2
    add         x0,x0,#4                    //pu1_src += 4 (next column)
    st1         {v0.s}[0],[x10],x3          //row 1: store, step to row 2
    ld1         {v0.s}[0],[x9],x2           //row 2: load, step to row 3
    subs        x8,x8,#4                    //bytes left -= 4 (flags consumed by bgt below)
    st1         {v0.s}[0],[x10],x3          //row 2: store, step to row 3
    ld1         {v0.s}[0],[x9],x2           //row 3: load, x9 now points at row 4
    add         x1,x1,#4                    //pu1_dst += 4 (next column)
    st1         {v0.s}[0],[x10],x3          //row 3: store, x10 now points at row 4

    bgt         inner_loop_wd_4

    // x9/x10 point 4 rows below the last column (offset wd - 4 into the row);
    // subtracting x15 rewinds them to the first column of the next row group.
    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of next row group
    sub         x1,x10,x15                  //pu1_dst = start of next row group
    bgt         outer_loop_wd_4

end_loops:
    ret


core_loop_wd_8:
    sub         x15,x16,#8                  //x15 = wd - 8, column rewind for the next row group

outer_loop_wd_8:
    mov         x8,x16                      //x8 = bytes left in this row group (> 0 here)

inner_loop_wd_8:
    add         x9,x0,x2                    //x9 = src pointer for row 1 of this column
    ld1         {v0.8b},[x0],#8             //row 0: load 8 bytes, pu1_src += 8
    add         x10,x1,x3                   //x10 = dst pointer for row 1 of this column
    st1         {v0.8b},[x1],#8             //row 0: store 8 bytes, pu1_dst += 8
    ld1         {v1.8b},[x9],x2             //row 1: load, step to row 2
    st1         {v1.8b},[x10],x3            //row 1: store, step to row 2
    subs        x8,x8,#8                    //bytes left -= 8 (flags consumed by bgt below)
    ld1         {v2.8b},[x9],x2             //row 2: load, step to row 3
    st1         {v2.8b},[x10],x3            //row 2: store, step to row 3
    ld1         {v3.8b},[x9],x2             //row 3: load, x9 now points at row 4
    st1         {v3.8b},[x10],x3            //row 3: store, x10 now points at row 4
    bgt         inner_loop_wd_8

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of next row group
    sub         x1,x10,x15                  //pu1_dst = start of next row group
    bgt         outer_loop_wd_8

    ret

core_loop_wd_16:
    sub         x15,x16,#16                 //x15 = wd - 16, column rewind for the next row group

outer_loop_wd_16:
    mov         x8,x16                      //x8 = bytes left in this row group (> 0 here)

inner_loop_wd_16:
    add         x9,x0,x2                    //x9 = src pointer for row 1 of this column
    ld1         {v0.16b},[x0],#16           //row 0: load 16 bytes, pu1_src += 16
    add         x10,x1,x3                   //x10 = dst pointer for row 1 of this column
    st1         {v0.16b},[x1],#16           //row 0: store 16 bytes, pu1_dst += 16
    ld1         {v1.16b},[x9],x2            //row 1: load, step to row 2
    st1         {v1.16b},[x10],x3           //row 1: store, step to row 2
    subs        x8,x8,#16                   //bytes left -= 16 (flags consumed by bgt below)
    ld1         {v2.16b},[x9],x2            //row 2: load, step to row 3
    st1         {v2.16b},[x10],x3           //row 2: store, step to row 3
    ld1         {v3.16b},[x9],x2            //row 3: load, x9 now points at row 4
    st1         {v3.16b},[x10],x3           //row 3: store, x10 now points at row 4
    bgt         inner_loop_wd_16

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of next row group
    sub         x1,x10,x15                  //pu1_dst = start of next row group
    bgt         outer_loop_wd_16

    ret