//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
///**
//*******************************************************************************
//*
//* @brief
//*     Interprediction luma function for copy
//*
//* @par Description:
//*   Copies the array of width 'wd' and height 'ht' from the  location pointed
//*   by 'src' to the location pointed by 'dst'
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264_inter_pred_luma_copy (
//                            UWORD8 *pu1_src,
//                            UWORD8 *pu1_dst,
//                            WORD32 src_strd,
//                            WORD32 dst_strd,
//                            WORD32 ht,
//                            WORD32 wd   )

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  ht
//    w5 =>  wd

.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_inter_pred_luma_copy_av8

//------------------------------------------------------------------------------
// void ih264_inter_pred_luma_copy_av8(UWORD8 *pu1_src, UWORD8 *pu1_dst,
//                                     WORD32 src_strd, WORD32 dst_strd,
//                                     WORD32 ht, WORD32 wd)
//
// Copies a wd x ht block of bytes from pu1_src to pu1_dst.
//
// In:    x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
//        w4 = ht, w5 = wd (the 32-bit arguments are sign-extended below)
// Out:   nothing
// Uses:  x0-x7, x11, x12, v0-v6, flags
//
// The block is walked in bands of four rows; inside a band the copy moves
// left to right in tiles of 16, 8 or 4 bytes, chosen once from wd:
//     wd % 16 == 0 -> 16-byte tiles
//     wd %  8 == 0 ->  8-byte tiles
//     otherwise    ->  4-byte tiles
// Because rows are always handled four at a time, a height that is not a
// multiple of 4 is effectively rounded up to the next multiple of 4.
// The per-band pointer rewind (wd - tile) is only exact when wd is a
// multiple of the tile size, so callers are assumed to pass wd % 4 == 0.
// Returns without touching memory when ht <= 0 or wd <= 0.
//
// NOTE(review): only v0-v6 are written here. If push_v_regs/pop_v_regs just
// preserve callee-saved SIMD registers they could be dropped - confirm
// against ih264_neon_macros.s.
//------------------------------------------------------------------------------
ih264_inter_pred_luma_copy_av8:

    push_v_regs
    sxtw      x2, w2                    //x2  = src_strd as a 64-bit offset
    sxtw      x3, w3                    //x3  = dst_strd as a 64-bit offset
    sxtw      x7, w4                    //x7  = rows still to be copied (ht)
    sxtw      x12, w5                   //x12 = wd

    cmp       x7, #0                    //ht <= 0 : nothing to copy
    ble       end_loops
    cmp       x12, #0                   //wd <= 0 : nothing to copy
    ble       end_loops
    tst       x12, #15                  //wd a multiple of 16 ?
    beq       core_loop_wd_16
    tst       x12, #7                   //wd a multiple of 8 ?
    beq       core_loop_wd_8

    //---------------- 4-byte tiles ----------------
    sub       x11, x12, #4              //x11 = wd - 4, used to rewind to column 0

outer_loop_wd_4:
    mov       x4, x12                   //x4 = bytes left in the current band

inner_loop_wd_4:
    add       x5, x0, x2                //x5 = src, row 1 of this tile
    add       x6, x1, x3                //x6 = dst, row 1 of this tile
    ld1       {v0.s}[0], [x0], #4       //row 0 : load 4 bytes, pu1_src += 4
    st1       {v0.s}[0], [x1], #4       //row 0 : store 4 bytes, pu1_dst += 4
    ld1       {v0.s}[0], [x5], x2       //row 1
    st1       {v0.s}[0], [x6], x3
    ld1       {v0.s}[0], [x5], x2       //row 2
    st1       {v0.s}[0], [x6], x3
    ld1       {v0.s}[0], [x5], x2       //row 3, x5/x6 now point 4 rows below the tile
    st1       {v0.s}[0], [x6], x3
    subs      x4, x4, #4                //wd -= 4
    bgt       inner_loop_wd_4

    subs      x7, x7, #4                //ht -= 4 (flags survive the two subs below)
    sub       x0, x5, x11               //pu1_src = column 0 of the next band
    sub       x1, x6, x11               //pu1_dst = column 0 of the next band
    bgt       outer_loop_wd_4
    b         end_loops

    //---------------- 8-byte tiles ----------------
core_loop_wd_8:
    sub       x11, x12, #8              //x11 = wd - 8, used to rewind to column 0

outer_loop_wd_8:
    mov       x4, x12                   //x4 = bytes left in the current band

inner_loop_wd_8:
    add       x5, x0, x2                //x5 = src, row 1 of this tile
    ld1       {v0.8b}, [x0], #8         //row 0 : load 8 bytes, pu1_src += 8
    add       x6, x1, x3                //x6 = dst, row 1 of this tile
    st1       {v0.8b}, [x1], #8         //row 0 : store 8 bytes, pu1_dst += 8
    ld1       {v1.8b}, [x5], x2         //row 1
    st1       {v1.8b}, [x6], x3
    subs      x4, x4, #8                //wd -= 8 (loop condition)
    ld1       {v2.8b}, [x5], x2         //row 2
    st1       {v2.8b}, [x6], x3
    ld1       {v3.8b}, [x5], x2         //row 3
    st1       {v3.8b}, [x6], x3
    bgt       inner_loop_wd_8

    subs      x7, x7, #4                //ht -= 4
    sub       x0, x5, x11               //pu1_src = column 0 of the next band
    sub       x1, x6, x11               //pu1_dst = column 0 of the next band
    bgt       outer_loop_wd_8
    b         end_loops

    //---------------- 16-byte tiles ---------------
core_loop_wd_16:
    sub       x11, x12, #16             //x11 = wd - 16, used to rewind to column 0

outer_loop_wd_16:
    mov       x4, x12                   //x4 = bytes left in the current band

inner_loop_wd_16:
    add       x5, x0, x2                //x5 = src, row 1 of this tile
    ld1       {v0.16b}, [x0], #16       //row 0 : load 16 bytes, pu1_src += 16
    add       x6, x1, x3                //x6 = dst, row 1 of this tile
    st1       {v0.16b}, [x1], #16       //row 0 : store 16 bytes, pu1_dst += 16
    ld1       {v2.16b}, [x5], x2        //row 1
    st1       {v2.16b}, [x6], x3
    subs      x4, x4, #16               //wd -= 16 (loop condition)
    ld1       {v4.16b}, [x5], x2        //row 2
    st1       {v4.16b}, [x6], x3
    ld1       {v6.16b}, [x5], x2        //row 3
    st1       {v6.16b}, [x6], x3
    bgt       inner_loop_wd_16

    subs      x7, x7, #4                //ht -= 4
    sub       x0, x5, x11               //pu1_src = column 0 of the next band
    sub       x1, x6, x11               //pu1_dst = column 0 of the next band
    bgt       outer_loop_wd_16

    //---------------- common exit -----------------
end_loops:
    pop_v_regs
    ret


// /*
// ********************************************************************************
// *
// * @brief This function copies a 4x4 block to destination
// *
// * @par Description:
// * Copies a 4x4 block to destination, where both src and dst are interleaved
// *
// * @param[in] pi2_src
// *  Source
// *
// * @param[in] pu1_out
// *  Output pointer
// *
// * @param[in] pred_strd,
// *  Prediction buffer stride
// *
// * @param[in] out_strd
// *  Output buffer stride
// *
// * @returns none
// *
// * @remarks
// *  Currently wd and ht are not used, i.e. a 4x4 block is always copied
// *
// *******************************************************************************
// */
// void ih264_interleave_copy(WORD16 *pi2_src,
//                            UWORD8 *pu1_out,
//                            WORD32 pred_strd,
//                            WORD32 out_strd,
//                            WORD32 wd,
//                            WORD32 ht)
// Register Usage
// x0 : pi2_src
// x1 : pu1_out
// w2 : src_strd
// w3 : out_strd
// NEON registers v2-v5, v18-v21 and v30 are used
// These are caller-saved, so no ARM or NEON registers need to be preserved

    .global ih264_interleave_copy_av8

//------------------------------------------------------------------------------
// ih264_interleave_copy_av8
//
// In:    x0 = source pointer, x1 = output pointer,
//        w2 = source stride,  w3 = output stride (both sign-extended below)
// Uses:  x0-x3, v2-v5, v18-v21, v30
//
// Reads four rows of 8 bytes from the source and the same four rows from the
// output, then rewrites the output rows so that every even-offset byte comes
// from the source while every odd-offset byte keeps its existing output value.
//------------------------------------------------------------------------------
ih264_interleave_copy_av8:
    push_v_regs
    sxtw      x2, w2                    //x2 = source stride as a 64-bit offset
    sxtw      x3, w3                    //x3 = output stride as a 64-bit offset

    movi      v30.8h, #0x00ff           //select mask: 0xff on even bytes, 0x00 on odd bytes

    // Source rows 0..3, packed two rows per 128-bit register.
    ld1       {v2.8b}, [x0], x2         //row 0
    ld1       {v3.8b}, [x0], x2         //row 1
    ld1       {v4.8b}, [x0], x2         //row 2
    ld1       {v5.8b}, [x0], x2         //row 3
    ins       v2.d[1], v3.d[0]          //v2 = source rows 0 | 1
    ins       v4.d[1], v5.d[0]          //v4 = source rows 2 | 3

    mov       x0, x1                    //x0 becomes the store cursor; x1 walks the loads

    // Current output rows 0..3, packed the same way.
    ld1       {v18.8b}, [x1], x3        //row 0
    ld1       {v19.8b}, [x1], x3        //row 1
    ld1       {v20.8b}, [x1], x3        //row 2
    ld1       {v21.8b}, [x1], x3        //row 3
    ins       v18.d[1], v19.d[0]        //v18 = output rows 0 | 1
    ins       v20.d[1], v21.d[0]        //v20 = output rows 2 | 3

    // Where the mask is set take the source bits, elsewhere keep the output bits.
    bit       v18.16b, v2.16b, v30.16b
    bit       v20.16b, v4.16b, v30.16b

    // Write the four merged rows back.
    st1       {v18.8b}, [x0], x3        //row 0 (low half of v18)
    st1       {v18.d}[1], [x0], x3      //row 1 (high half of v18)
    st1       {v20.8b}, [x0], x3        //row 2 (low half of v20)
    st1       {v20.d}[1], [x0], x3      //row 3 (high half of v20)

    pop_v_regs
    ret