@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_copy_w16out_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon intrinsics and can be compiled using
@*  rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*   chroma inter prediction filter for copy, with 16-bit output
@*
@* @par description:
@*    copies the array of width 'wd' and height 'ht' from the location pointed
@*    by 'pu1_src' to the 16-bit destination pointed by 'pi2_dst', left
@*    shifting each pixel value by 6
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
@                                           word16 *pi2_dst,
@                                           word32 src_strd,
@                                           word32 dst_strd,
@                                           word8 *pi1_coeff,
@                                           word32 ht,
@                                           word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pi2_dst
@r2 =>  src_strd
@r3 =>  dst_strd
@r4 => *pi1_coeff
@r5 =>  ht
@r6 =>  wd
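
@ the routine zero extends each 8-bit chroma sample and left shifts it by 6
@ before storing it to the 16-bit destination. the commented c sketch below
@ (hypothetical reference name, not part of the build) shows the intended
@ behaviour, assuming src_strd is in bytes and dst_strd is in 16-bit units;
@ 2*wd interleaved cb/cr samples are copied per row:
@
@void ihevc_inter_pred_chroma_copy_w16out_ref(uword8 *pu1_src,
@                                             word16 *pi2_dst,
@                                             word32 src_strd,
@                                             word32 dst_strd,
@                                             word8 *pi1_coeff,
@                                             word32 ht,
@                                             word32 wd)
@{
@    word32 row, col;
@    (void)pi1_coeff;                        /* unused in the copy case */
@    for(row = 0; row < ht; row++)
@    {
@        for(col = 0; col < 2 * wd; col++)
@            pi2_dst[col] = (word16)(pu1_src[col] << 6);
@        pu1_src += src_strd;
@        pi2_dst += dst_strd;
@    }
@}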

.text
.align 4




.globl ihevc_inter_pred_chroma_copy_w16out_a9q

.type ihevc_inter_pred_chroma_copy_w16out_a9q, %function

ihevc_inter_pred_chroma_copy_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    ldr         r12,[sp,#48]                @loads wd
    lsl         r12,r12,#1                  @2*wd
    ldr         r7,[sp,#44]                 @loads ht
    cmp         r7,#0                       @ht condition(ht == 0)
    ble         end_loops                   @return if ht <= 0
    and         r8,r7,#3                    @r8 = ht % 4
    sub         r9,r7,r8                    @r9 = ht rounded down to a multiple of 4
    and         r11,r7,#6                   @r11 = ht & 6
    cmp         r11,#6                      @check if bits 1 and 2 of ht are both set (e.g. ht == 6)
    beq         loop_ht_6                   @take the wd_4 path for such heights
    tst         r12,#7                      @check if 2*wd is a multiple of 8
    beq         core_loop_wd_8              @take the wd_8 path when it is

loop_ht_6:
    sub         r11,r12,#4                  @r11 = (2*wd) - 4
    lsls        r6,r3,#1                    @r6 = 2*dst_strd (destination row stride in bytes)
    cmp         r9,#0                       @check if the multiple-of-4 row count is zero
    beq         outer_loop_wd_4_ht_2        @only the 2-row remainder is left

outer_loop_wd_4:
    subs        r4,r12,#0                   @r4 = 2*wd and set flags
    ble         end_inner_loop_wd_4

inner_loop_wd_4:
    vld1.8      {d0},[r0]                   @vld1_u8(pu1_src_tmp)
    add         r5,r0,r2                    @pu1_src +src_strd
    vmovl.u8    q0,d0                       @vmovl_u8(vld1_u8(pu1_src_tmp)
    add         r10,r1,r6
    subs        r4,r4,#4                    @wd - 4
    vshl.i64    q0,q0,#6                    @vshlq_n_s64(temp, 6)
    vld1.8      {d22},[r5],r2               @vld1_u8(pu1_src_tmp)
    add         r0,r0,#4                    @pu1_src += 4
    vst1.64     {d0},[r1]                   @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add         r1,r1,#8
    vmovl.u8    q11,d22                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    vld1.8      {d24},[r5],r2               @vld1_u8(pu1_src_tmp)
    vshl.i64    q11,q11,#6                  @vshlq_n_s64(temp, 6)
    vmovl.u8    q12,d24                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    vst1.64     {d22},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    vshl.i64    q12,q12,#6                  @vshlq_n_s64(temp, 6)
    vld1.8      {d26},[r5],r2               @vld1_u8(pu1_src_tmp)
    vst1.64     {d24},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    vmovl.u8    q13,d26                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    vshl.i64    q13,q13,#6                  @vshlq_n_s64(temp, 6)
    vst1.64     {d26},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r9,r9,#4                    @ht - 4
    sub         r0,r5,r11                   @pu1_src points to the start of the next 4 rows
    sub         r1,r10,r11,lsl #1           @pi2_dst points to the start of the next 4 rows
    bgt         outer_loop_wd_4
    cmp         r8,#0
    bgt         outer_loop_wd_4_ht_2


end_loops:
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp


outer_loop_wd_4_ht_2:
    subs        r4,r12,#0                   @r4 = 2*wd and set flags
    ble         end_inner_loop_wd_4

inner_loop_wd_4_ht_2:
    vld1.8      {d0},[r0]                   @vld1_u8(pu1_src_tmp)
    add         r5,r0,r2                    @pu1_src +src_strd
    vmovl.u8    q0,d0                       @vmovl_u8(vld1_u8(pu1_src_tmp)
    add         r10,r1,r6
    subs        r4,r4,#4                    @wd - 4
    vshl.i64    q0,q0,#6                    @vshlq_n_s64(temp, 6)
    vld1.8      {d22},[r5],r2               @vld1_u8(pu1_src_tmp)
    add         r0,r0,#4                    @pu1_src += 4
    vst1.64     {d0},[r1]                   @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add         r1,r1,#8
    vmovl.u8    q11,d22                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    vld1.8      {d24},[r5],r2               @vld1_u8(pu1_src_tmp)
    vshl.i64    q11,q11,#6                  @vshlq_n_s64(temp, 6)
    vmovl.u8    q12,d24                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    vst1.64     {d22},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt         inner_loop_wd_4_ht_2
    b           end_loops


core_loop_wd_8:
    @sub            r11,r12,#8
    lsls        r5,r3,#1                    @r5 = 2*dst_strd (destination row stride in bytes)
    rsb         r11,r12,r3, lsl #2          @r11 = (dst_strd * 4) - (2*wd)
    rsb         r8,r12,r2,lsl #2            @r8 = (src_strd * 4) - (2*wd)
    mov         r4,r12, lsr #3              @r4 = (2*wd) / 8
    mov         r7,r9                       @r7 = rounded height
    mul         r7, r4                      @r7 = ((2*wd) / 8) * rounded height
    sub         r4,r12,#0                   @r4 = 2*wd
    sub         r7,r7,#4                    @reserve one iteration (4) for the epilog
    cmp         r9,#0                       @check if the multiple-of-4 row count is zero
    beq         core_loop_wd_8_ht_2         @only the 2-row remainder is left

prolog:
    add         r6,r0,r2                    @pu1_src_tmp += src_strd
    add         r10,r1,r5
    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp)
    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    subs        r4,r4,#8                    @wd decrements by 8
    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)
    addle       r0,r0,r8
    add         r6,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)

    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
    addle       r1,r1,r11,lsl #1
    suble       r4,r12,#0                   @wd conditional check

    subs        r7,r7,#4                    @decrement the block counter by 4

    blt         epilog_end                  @jumps to epilog_end
    beq         epilog                      @jumps to epilog



outer_loop_wd_8:

    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp)

    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp)

    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp)

    subs        r4,r4,#8                    @wd decrements by 8
    addle       r0,r0,r8

    add         r6,r0,r2                    @pu1_src_tmp += src_strd

    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)

    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)

    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)

    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
    add         r10,r1,r5

    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)

    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)

    addle       r1,r1,r11,lsl #1
    suble       r4,r12,#0                   @wd conditional check

    subs        r7,r7,#4                    @decrement the block counter by 4
    bgt         outer_loop_wd_8

epilog:
    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp)

    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp)

    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp)
    @add        r6,r0,r2                @pu1_src_tmp += src_strd

    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
    add         r10,r1,r5
    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)

    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
epilog_end:
    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    b           end_loops

core_loop_wd_8_ht_2:
    add         r6,r0,r2                    @pu1_src_tmp += src_strd
    add         r10,r1,r5
    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp)
    subs        r12,r12,#8                  @wd decrements by 8
    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    bgt         core_loop_wd_8_ht_2

    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp