@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_bilinear_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction  interpolation.
@*
@* @author
@* Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_bilinear_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@**
@**
@ *******************************************************************************
@ *  function:ih264_inter_pred_luma_bilinear
@ *
@* @brief
@*    This routine applies the bilinear filter to the predictors.
@*    The  filtering operation is described in
@*    sec 8.4.2.2.1 titled "Luma sample interpolation process"
@*
@* @par Description:
@\note
@*    This function is called to obtain pixels lying at the following
@*    locations: (1/4,1), (3/4,1), (1,1/4), (1,3/4), (1/4,1/2), (3/4,1/2),
@*    (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4) and (1/4,1/4).
@*    The function averages the two adjacent values supplied by the two input
@*    arrays: each output pixel is the rounded average (a + b + 1) >> 1 of the
@*    pixels a and b at the same position in pu1_src1 and pu1_src2.
@*
@*
@* @param[in] pu1_src1:
@*  UWORD8 Pointer to the buffer containing the first input array.
@*
@* @param[in] pu1_src2:
@*  UWORD8 Pointer to the buffer containing the second input array.
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination where the output of bilinear filter is stored.
@*
@* @param[in] src_strd1
@*  Stride of the first input buffer
@*
@* @param[in] src_strd2
@*  Stride of the second input buffer
@*
@* @param[in] dst_strd
@*  integer destination stride of pu1_dst
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
@                                   UWORD8 *pu1_src2,
@                                   UWORD8 *pu1_dst,
@                                   WORD32 src_strd1,
@                                   WORD32 src_strd2,
@                                   WORD32 dst_strd,
@                                   WORD32 height,
@                                   WORD32 width)
@
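@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src1
@   r1 => *pu1_src2
@   r2 => *pu1_dst
@   r3 =>  src_strd1
@   r4 =>  src_strd2   (fifth argument, loaded from the stack)
@   r5 =>  dst_strd    (sixth argument, loaded from the stack)
@   r6 =>  height      (seventh argument, loaded from the stack)
@   r7 =>  width       (eighth argument, loaded from the stack)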
@
.text
.p2align 2

    .global ih264_inter_pred_luma_bilinear_a9q

ih264_inter_pred_luma_bilinear_a9q:

    @ Prologue: save the core registers (10 words = 40 bytes) and the NEON
    @ registers d8-d15 (64 bytes), which the row loops below use as load
    @ targets.
    push          {r4-r12, lr}
    vpush         {d8-d15}

    @ The four stack arguments now sit 40 + 64 = 104 bytes above sp.
    @ Fetch them in one go: r4 = src_strd2, r5 = dst_strd, r6 = ht, r7 = wd.
    @ r12 is used only as a scratch pointer here.
    add           r12, sp, #104
    ldmia         r12, {r4-r7}

    @ Dispatch on width; any width other than 4 or 8 falls through to the
    @ 16-wide path that follows this block.
    cmp           r7, #4
    beq           loop_4                @ wd == 4
    cmp           r7, #8
    beq           loop_8                @ wd == 8

loop_16:                                @ wd = 16 (reached by fall-through)

    @ 16-pixel-wide path. Each row is one 16-byte load from src1 (r0) and
    @ one from src2 (r1); both pointers post-increment by their stride.
    @ For every row:
    @     vaddl.u8     widens and adds the two 8-byte halves (16-bit sums)
    @     vqrshrun.s16 #1 narrows back to u8 with rounding: (a + b + 1) >> 1
    @ Results are assembled in q14/q15 (d28-d31) and stored to dst (r2),
    @ which post-increments by dst_strd (r5).
    @ Loads, arithmetic and stores of neighbouring rows are interleaved, so
    @ the instruction order below is significant.
    @
    @ Rows 0-7 are always processed. If ht == 8 the path ends there;
    @ otherwise rows 8-15 are processed as well.

    @ ---------------- rows 0-7 ----------------
    vld1.8        {q0}, [r0], r3        @ row0 src1
    vld1.8        {q2}, [r1], r4        @ row0 src2
    vld1.8        {q1}, [r0], r3        @ row1 src1
    vaddl.u8      q10, d0, d4           @ row0 sums, low half
    vld1.8        {q3}, [r1], r4        @ row1 src2
    vaddl.u8      q11, d1, d5           @ row0 sums, high half
    vld1.8        {q4}, [r0], r3        @ row2 src1
    vaddl.u8      q12, d2, d6           @ row1 sums, low half
    vld1.8        {q5}, [r0], r3        @ row3 src1
    vaddl.u8      q13, d3, d7           @ row1 sums, high half
    vld1.8        {q6}, [r1], r4        @ row2 src2
    vaddl.u8      q8, d8, d12           @ row2 sums, low half
    vld1.8        {q7}, [r1], r4        @ row3 src2
    vaddl.u8      q9, d9, d13           @ row2 sums, high half
    vqrshrun.s16  d28, q10, #1          @ row0 result, low half
    vqrshrun.s16  d29, q11, #1          @ row0 result, high half
    vaddl.u8      q10, d10, d14         @ row3 sums, low half
    vqrshrun.s16  d30, q12, #1          @ row1 result, low half
    vqrshrun.s16  d31, q13, #1          @ row1 result, high half
    vst1.8        {q14}, [r2], r5       @ store dest row0
    vaddl.u8      q11, d11, d15         @ row3 sums, high half
    vst1.8        {q15}, [r2], r5       @ store dest row1
    vqrshrun.s16  d28, q8, #1           @ row2 result, low half
    vld1.8        {q0}, [r0], r3        @ row4 src1
    vqrshrun.s16  d29, q9, #1           @ row2 result, high half
    vld1.8        {q1}, [r0], r3        @ row5 src1
    vqrshrun.s16  d30, q10, #1          @ row3 result, low half
    vld1.8        {q2}, [r1], r4        @ row4 src2
    vqrshrun.s16  d31, q11, #1          @ row3 result, high half
    vld1.8        {q3}, [r1], r4        @ row5 src2
    vaddl.u8      q10, d0, d4           @ row4 sums, low half
    vst1.8        {q14}, [r2], r5       @ store dest row2
    vaddl.u8      q13, d3, d7           @ row5 sums, high half
    vst1.8        {q15}, [r2], r5       @ store dest row3
    vaddl.u8      q11, d1, d5           @ row4 sums, high half
    vld1.8        {q4}, [r0], r3        @ row6 src1
    vaddl.u8      q12, d2, d6           @ row5 sums, low half
    vld1.8        {q5}, [r0], r3        @ row7 src1
    vqrshrun.s16  d28, q10, #1          @ row4 result, low half
    vld1.8        {q6}, [r1], r4        @ row6 src2
    vqrshrun.s16  d29, q11, #1          @ row4 result, high half
    vld1.8        {q7}, [r1], r4        @ row7 src2
    vaddl.u8      q8, d8, d12           @ row6 sums, low half
    vaddl.u8      q9, d9, d13           @ row6 sums, high half
    vaddl.u8      q10, d10, d14         @ row7 sums, low half
    vqrshrun.s16  d30, q12, #1          @ row5 result, low half
    vqrshrun.s16  d31, q13, #1          @ row5 result, high half
    vst1.8        {q14}, [r2], r5       @ store dest row4
    vaddl.u8      q11, d11, d15         @ row7 sums, high half
    vst1.8        {q15}, [r2], r5       @ store dest row5
    vqrshrun.s16  d28, q8, #1           @ row6 result, low half
    vqrshrun.s16  d30, q10, #1          @ row7 result, low half
    vqrshrun.s16  d29, q9, #1           @ row6 result, high half
    vqrshrun.s16  d31, q11, #1          @ row7 result, high half
    vst1.8        {q14}, [r2], r5       @ store dest row6
    subs          r12, r6, #8           @ Z set when ht == 8 (r12 is scratch)
    vst1.8        {q15}, [r2], r5       @ store dest row7

    beq           end_func              @ ht == 8: done after eight rows

    @ ---------------- rows 8-15 ----------------
    @ The row-8 src2 load is issued only after the height check above, so
    @ that an 8-row block never reads a ninth row from pu1_src2.
    vld1.8        {q0}, [r0], r3        @ row8 src1
    vld1.8        {q2}, [r1], r4        @ row8 src2
    vaddl.u8      q10, d0, d4           @ row8 sums, low half
    vld1.8        {q1}, [r0], r3        @ row9 src1
    vaddl.u8      q11, d1, d5           @ row8 sums, high half
    vld1.8        {q3}, [r1], r4        @ row9 src2
    vqrshrun.s16  d28, q10, #1          @ row8 result, low half
    vld1.8        {q4}, [r0], r3        @ row10 src1
    vqrshrun.s16  d29, q11, #1          @ row8 result, high half
    vld1.8        {q5}, [r0], r3        @ row11 src1
    vaddl.u8      q12, d2, d6           @ row9 sums, low half
    vld1.8        {q6}, [r1], r4        @ row10 src2
    vaddl.u8      q13, d3, d7           @ row9 sums, high half
    vld1.8        {q7}, [r1], r4        @ row11 src2
    vaddl.u8      q8, d8, d12           @ row10 sums, low half
    vaddl.u8      q9, d9, d13           @ row10 sums, high half
    vaddl.u8      q10, d10, d14         @ row11 sums, low half
    vqrshrun.s16  d30, q12, #1          @ row9 result, low half
    vst1.8        {q14}, [r2], r5       @ store dest row8
    vqrshrun.s16  d31, q13, #1          @ row9 result, high half
    vst1.8        {q15}, [r2], r5       @ store dest row9
    vqrshrun.s16  d28, q8, #1           @ row10 result, low half
    vld1.8        {q0}, [r0], r3        @ row12 src1
    vaddl.u8      q11, d11, d15         @ row11 sums, high half
    vld1.8        {q1}, [r0], r3        @ row13 src1
    vqrshrun.s16  d29, q9, #1           @ row10 result, high half
    vld1.8        {q2}, [r1], r4        @ row12 src2
    vqrshrun.s16  d30, q10, #1          @ row11 result, low half
    vld1.8        {q3}, [r1], r4        @ row13 src2
    vqrshrun.s16  d31, q11, #1          @ row11 result, high half
    vst1.8        {q14}, [r2], r5       @ store dest row10
    vaddl.u8      q10, d0, d4           @ row12 sums, low half
    vst1.8        {q15}, [r2], r5       @ store dest row11
    vaddl.u8      q11, d1, d5           @ row12 sums, high half
    vld1.8        {q4}, [r0], r3        @ row14 src1
    vaddl.u8      q13, d3, d7           @ row13 sums, high half
    vld1.8        {q5}, [r0], r3        @ row15 src1
    vaddl.u8      q12, d2, d6           @ row13 sums, low half
    vld1.8        {q6}, [r1], r4        @ row14 src2
    vaddl.u8      q8, d8, d12           @ row14 sums, low half
    vld1.8        {q7}, [r1], r4        @ row15 src2
    vaddl.u8      q9, d9, d13           @ row14 sums, high half
    vqrshrun.s16  d28, q10, #1          @ row12 result, low half
    vqrshrun.s16  d29, q11, #1          @ row12 result, high half
    vaddl.u8      q10, d10, d14         @ row15 sums, low half
    vst1.8        {q14}, [r2], r5       @ store dest row12
    vqrshrun.s16  d30, q12, #1          @ row13 result, low half
    vqrshrun.s16  d31, q13, #1          @ row13 result, high half
    vaddl.u8      q11, d11, d15         @ row15 sums, high half
    vst1.8        {q15}, [r2], r5       @ store dest row13
    vqrshrun.s16  d28, q8, #1           @ row14 result, low half
    vqrshrun.s16  d29, q9, #1           @ row14 result, high half
    vqrshrun.s16  d30, q10, #1          @ row15 result, low half
    vst1.8        {q14}, [r2], r5       @ store dest row14
    vqrshrun.s16  d31, q11, #1          @ row15 result, high half
    vst1.8        {q15}, [r2], r5       @ store dest row15
    b             end_func



    @ 8-pixel-wide path. Each row is one 8-byte load from src1 (r0) and one
    @ from src2 (r1), both post-incremented by their stride. For every row,
    @ vaddl.u8 forms the 16-bit sums and vqrshrun.s16 #1 narrows them back to
    @ u8 with rounding, i.e. (a + b + 1) >> 1. Results go out through d28-d31
    @ to dst (r2), post-incremented by dst_strd (r5).
    @ Rows are handled four at a time: the path exits after 4 rows when
    @ ht == 4, after 8 rows when ht == 8, and otherwise processes 16 rows.
    @ Loads, arithmetic and stores are interleaved; instruction order matters.
loop_8: @ wd = 8
    @ ---------------- rows 0-3 ----------------
    vld1.8        {d0}, [r0], r3        @ row0 src1
    vld1.8        {d4}, [r1], r4        @ row0 src2
    vld1.8        {d1}, [r0], r3        @ row1 src1
    vaddl.u8      q10, d0, d4           @ row0 sums
    vld1.8        {d5}, [r1], r4        @ row1 src2
    vld1.8        {d2}, [r0], r3        @ row2 src1
    vqrshrun.s16  d28, q10, #1          @ row0 result
    vld1.8        {d6}, [r1], r4        @ row2 src2
    vaddl.u8      q11, d1, d5           @ row1 sums
    vld1.8        {d3}, [r0], r3        @ row3 src1
    vaddl.u8      q12, d2, d6           @ row2 sums
    vst1.8        {d28}, [r2], r5       @ store dest row0
    vqrshrun.s16  d29, q11, #1          @ row1 result
    vld1.8        {d7}, [r1], r4        @ row3 src2
    vqrshrun.s16  d30, q12, #1          @ row2 result
    vst1.8        {d29}, [r2], r5       @ store dest row1
    vaddl.u8      q13, d3, d7           @ row3 sums
    vst1.8        {d30}, [r2], r5       @ store dest row2
    vqrshrun.s16  d31, q13, #1          @ row3 result
    subs          r12, r6, #4           @ Z set when ht == 4 (r12 is scratch)
    vst1.8        {d31}, [r2], r5       @ store dest row3
    beq           end_func              @ ht == 4: done after four rows

    @ ---------------- rows 4-7 ----------------
    vld1.8        {d12}, [r1], r4       @ row4 src2
    vld1.8        {d8}, [r0], r3        @ row4 src1
    vld1.8        {d9}, [r0], r3        @ row5 src1
    vaddl.u8      q8, d8, d12           @ row4 sums
    vld1.8        {d13}, [r1], r4       @ row5 src2
    vld1.8        {d10}, [r0], r3       @ row6 src1
    vaddl.u8      q9, d9, d13           @ row5 sums
    vld1.8        {d14}, [r1], r4       @ row6 src2
    vqrshrun.s16  d28, q8, #1           @ row4 result
    vld1.8        {d11}, [r0], r3       @ row7 src1
    vqrshrun.s16  d29, q9, #1           @ row5 result
    vst1.8        {d28}, [r2], r5       @ store dest row4
    vaddl.u8      q10, d10, d14         @ row6 sums
    vst1.8        {d29}, [r2], r5       @ store dest row5
    vqrshrun.s16  d30, q10, #1          @ row6 result
    vld1.8        {d15}, [r1], r4       @ row7 src2
    vaddl.u8      q11, d11, d15         @ row7 sums
    vst1.8        {d30}, [r2], r5       @ store dest row6
    vqrshrun.s16  d31, q11, #1          @ row7 result
    subs          r12, r6, #8           @ Z set when ht == 8
    vst1.8        {d31}, [r2], r5       @ store dest row7
    beq           end_func              @ ht == 8: done after eight rows

    @ ---------------- rows 8-15 ----------------
    vld1.8        {d0}, [r0], r3        @ row8 src1
    vld1.8        {d4}, [r1], r4        @ row8 src2
    vld1.8        {d1}, [r0], r3        @ row9 src1
    vaddl.u8      q10, d0, d4           @ row8 sums
    vld1.8        {d5}, [r1], r4        @ row9 src2
    vld1.8        {d2}, [r0], r3        @ row10 src1
    vaddl.u8      q11, d1, d5           @ row9 sums
    vld1.8        {d6}, [r1], r4        @ row10 src2
    vqrshrun.s16  d28, q10, #1          @ row8 result
    vld1.8        {d3}, [r0], r3        @ row11 src1
    vaddl.u8      q12, d2, d6           @ row10 sums
    vld1.8        {d7}, [r1], r4        @ row11 src2
    vqrshrun.s16  d29, q11, #1          @ row9 result
    vld1.8        {d8}, [r0], r3        @ row12 src1
    vaddl.u8      q13, d3, d7           @ row11 sums
    vst1.8        {d28}, [r2], r5       @ store dest row8
    vqrshrun.s16  d30, q12, #1          @ row10 result
    vld1.8        {d12}, [r1], r4       @ row12 src2
    vqrshrun.s16  d31, q13, #1          @ row11 result
    vst1.8        {d29}, [r2], r5       @ store dest row9
    vaddl.u8      q8, d8, d12           @ row12 sums
    vld1.8        {d9}, [r0], r3        @ row13 src1
    vqrshrun.s16  d28, q8, #1           @ row12 result (d28 already stored as row8)
    vld1.8        {d13}, [r1], r4       @ row13 src2
    vld1.8        {d10}, [r0], r3       @ row14 src1
    vaddl.u8      q9, d9, d13           @ row13 sums
    vld1.8        {d11}, [r0], r3       @ row15 src1
    vld1.8        {d14}, [r1], r4       @ row14 src2
    vqrshrun.s16  d29, q9, #1           @ row13 result (d29 already stored as row9)
    vld1.8        {d15}, [r1], r4       @ row15 src2
    vaddl.u8      q10, d10, d14         @ row14 sums
    vst1.8        {d30}, [r2], r5       @ store dest row10
    vaddl.u8      q11, d11, d15         @ row15 sums
    vst1.8        {d31}, [r2], r5       @ store dest row11
    vqrshrun.s16  d30, q10, #1          @ row14 result
    vst1.8        {d28}, [r2], r5       @ store dest row12
    vqrshrun.s16  d31, q11, #1          @ row15 result
    vst1.8        {d29}, [r2], r5       @ store dest row13
    vst1.8        {d30}, [r2], r5       @ store dest row14
    vst1.8        {d31}, [r2], r5       @ store dest row15

    b             end_func



    @ 4-pixel-wide path. Each row is a single 32-bit lane load (lane 0 of a
    @ d register) from src1 (r0) and src2 (r1), both post-incremented by
    @ their stride. vaddl.u8 / vqrshrun.s16 #1 compute (a + b + 1) >> 1 over
    @ the whole d register, but only lane 0 (4 bytes) of each result is
    @ stored to dst (r2), so the contents of the unloaded upper lane never
    @ reach memory.
    @ Rows 0-3 are always processed; the path exits there when ht == 4.
    @ Otherwise rows 4-7 are processed and control falls through to the
    @ label that follows this block (no further height check is made).
loop_4:
    @ ---------------- rows 0-3 ----------------
    vld1.32       d0[0], [r0], r3       @ row0 src1
    vld1.32       d4[0], [r1], r4       @ row0 src2
    vld1.32       d1[0], [r0], r3       @ row1 src1
    vaddl.u8      q10, d0, d4           @ row0 sums
    vld1.32       d5[0], [r1], r4       @ row1 src2
    vld1.32       d2[0], [r0], r3       @ row2 src1
    vqrshrun.s16  d28, q10, #1          @ row0 result
    vld1.32       d6[0], [r1], r4       @ row2 src2
    vaddl.u8      q11, d1, d5           @ row1 sums
    vld1.32       d3[0], [r0], r3       @ row3 src1
    vaddl.u8      q12, d2, d6           @ row2 sums
    vst1.32       d28[0], [r2], r5      @ store dest row0
    vqrshrun.s16  d29, q11, #1          @ row1 result
    vld1.32       d7[0], [r1], r4       @ row3 src2
    vqrshrun.s16  d30, q12, #1          @ row2 result
    vst1.32       d29[0], [r2], r5      @ store dest row1
    vaddl.u8      q13, d3, d7           @ row3 sums
    vst1.32       d30[0], [r2], r5      @ store dest row2
    vqrshrun.s16  d31, q13, #1          @ row3 result
    subs          r12, r6, #4           @ Z set when ht == 4 (r12 is scratch)
    vst1.32       d31[0], [r2], r5      @ store dest row3
    beq           end_func              @ ht == 4: done after four rows

    @ ---------------- rows 4-7 ----------------
    vld1.32       d12[0], [r1], r4      @ row4 src2
    vld1.32       d8[0], [r0], r3       @ row4 src1
    vld1.32       d9[0], [r0], r3       @ row5 src1
    vaddl.u8      q8, d8, d12           @ row4 sums
    vld1.32       d13[0], [r1], r4      @ row5 src2
    vld1.32       d10[0], [r0], r3      @ row6 src1
    vaddl.u8      q9, d9, d13           @ row5 sums
    vld1.32       d14[0], [r1], r4      @ row6 src2
    vqrshrun.s16  d28, q8, #1           @ row4 result
    vld1.32       d11[0], [r0], r3      @ row7 src1
    vqrshrun.s16  d29, q9, #1           @ row5 result
    vst1.32       d28[0], [r2], r5      @ store dest row4
    vaddl.u8      q10, d10, d14         @ row6 sums
    vst1.32       d29[0], [r2], r5      @ store dest row5
    vqrshrun.s16  d30, q10, #1          @ row6 result
    vld1.32       d15[0], [r1], r4      @ row7 src2
    vaddl.u8      q11, d11, d15         @ row7 sums
    vst1.32       d30[0], [r2], r5      @ store dest row6
    vqrshrun.s16  d31, q11, #1          @ row7 result
    vst1.32       d31[0], [r2], r5      @ store dest row7

end_func:

    @ Common exit for all width paths: undo the prologue in reverse order.
    vpop          {d8-d15}              @ restore the saved NEON registers
    pop           {r4-r12, pc}          @ restore core registers; the saved
                                        @ return address is loaded into pc