@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_bilinear_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction interpolation.
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_bilinear_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@* All the functions here are replicated from ih264_inter_pred_filters.c
@
@**
@**
@**
@ *******************************************************************************
@ * function:ih264_inter_pred_luma_bilinear
@ *
@* @brief
@*    This routine applies the bilinear filter to the predictors.
@*    The filtering operation is described in
@*    sec 8.4.2.2.1 titled "Luma sample interpolation process"
@*
@* @par Description:
@* @note
@*    This function is called to obtain pixels lying at the following
@*    locations (1/4,1), (3/4,1), (1,1/4), (1,3/4), (1/4,1/2), (3/4,1/2),
@*    (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4) and (1/4,1/4).
@*    Each output pixel is the rounded average of the co-located pixels of the
@*    two input arrays:
@*        pu1_dst[y][x] = (pu1_src1[y][x] + pu1_src2[y][x] + 1) >> 1
@*
@* @param[in] pu1_src1:
@*  UWORD8 Pointer to the buffer containing the first input array.
@*
@* @param[in] pu1_src2:
@*  UWORD8 Pointer to the buffer containing the second input array.
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination where the output of bilinear filter is stored.
@*
@* @param[in] src_strd1
@*  Stride of the first input buffer
@*
@* @param[in] src_strd2
@*  Stride of the second input buffer
@*
@* @param[in] dst_strd
@*  integer destination stride of pu1_dst
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  None
@*
@* @remarks
@*  Width dispatch : wd == 4 -> loop_4, wd == 8 -> loop_8, any other value
@*                   is processed by the 16-wide path (loop_16).
@*  Height handling: the code is fully unrolled and only tests for the
@*                   early-exit heights:
@*                     loop_16 : 8 rows, or 16 rows when ht != 8
@*                     loop_8  : 4 rows, 8 rows, or 16 rows when ht is
@*                               neither 4 nor 8
@*                     loop_4  : 4 rows, or 8 rows when ht != 4
@*  Clobbers       : r0-r2 are advanced past the processed rows, r12 and the
@*                   condition flags are used as scratch, q0-q3 and q8-q15
@*                   are overwritten. r4-r12, r14 and d8-d15 are saved on
@*                   entry and restored on exit.
@*
@*******************************************************************************
@*

@void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
@                                    UWORD8 *pu1_src2,
@                                    UWORD8 *pu1_dst,
@                                    WORD32 src_strd1,
@                                    WORD32 src_strd2,
@                                    WORD32 dst_strd,
@                                    WORD32 height,
@                                    WORD32 width)
@
@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src1
@   r1 => *pu1_src2
@   r2 => *pu1_dst
@   r3 =>  src_strd1
@   r4 =>  src_strd2     (loaded from the stack)
@   r5 =>  dst_strd      (loaded from the stack)
@   r6 =>  height        (loaded from the stack)
@   r7 =>  width         (loaded from the stack)
@   r12 => scratch for the width / height comparisons
@

.text
.p2align 2

    .global ih264_inter_pred_luma_bilinear_a9q

ih264_inter_pred_luma_bilinear_a9q:

    stmfd         sp!, {r4-r12, r14}    @ save core registers (10 regs = 40 bytes)
    vstmdb        sp!, {d8-d15}         @ save NEON registers d8-d15 (64 bytes)

    @ The prologue pushed 40 + 64 = 104 bytes, so the caller's stack
    @ arguments (5th argument onwards) start at sp + 104.
    ldr           r4, [sp, #104]        @ r4 = src_strd2
    ldr           r5, [sp, #108]        @ r5 = dst_strd
    ldr           r6, [sp, #112]        @ r6 = height
    ldr           r7, [sp, #116]        @ r7 = width

    subs          r12, r7, #4           @ if wd == 4 branch to loop_4
    beq           loop_4
    subs          r12, r7, #8           @ if wd == 8 branch to loop_8
    beq           loop_8

    @ -------------------------------------------------------------------
    @ wd == 16 : one q register per row. For every row the two 8-byte
    @ halves are widened and added (vaddl.u8), then narrowed back with a
    @ rounding shift right by 1 (vqrshrun.s16 #1), i.e. (a + b + 1) >> 1.
    @ Loads, adds and stores of neighbouring rows are interleaved.
    @ -------------------------------------------------------------------
loop_16:                                @ when wd == 16

    vld1.8        {q0}, [r0], r3        @// Load row0 ;src1
    vld1.8        {q2}, [r1], r4        @// Load row0 ;src2
    vld1.8        {q1}, [r0], r3        @// Load row1 ;src1
    vaddl.u8      q10, d0, d4
    vld1.8        {q3}, [r1], r4        @// Load row1 ;src2
    vaddl.u8      q11, d1, d5
    vld1.8        {q4}, [r0], r3        @// Load row2 ;src1
    vaddl.u8      q12, d2, d6
    vld1.8        {q5}, [r0], r3        @// Load row3 ;src1
    vaddl.u8      q13, d3, d7
    vld1.8        {q6}, [r1], r4        @// Load row2 ;src2
    vaddl.u8      q8, d8, d12
    vld1.8        {q7}, [r1], r4        @// Load row3 ;src2
    vaddl.u8      q9, d9, d13
    vqrshrun.s16  d28, q10, #1
    vqrshrun.s16  d29, q11, #1
    vaddl.u8      q10, d10, d14
    vqrshrun.s16  d30, q12, #1
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q14}, [r2], r5       @//Store dest row0
    vaddl.u8      q11, d11, d15
    vst1.8        {q15}, [r2], r5       @//Store dest row1
    vqrshrun.s16  d28, q8, #1
    vld1.8        {q0}, [r0], r3        @// Load row4 ;src1
    vqrshrun.s16  d29, q9, #1
    vld1.8        {q1}, [r0], r3        @// Load row5 ;src1
    vqrshrun.s16  d30, q10, #1
    vld1.8        {q2}, [r1], r4        @// Load row4 ;src2
    vqrshrun.s16  d31, q11, #1
    vld1.8        {q3}, [r1], r4        @// Load row5 ;src2
    vaddl.u8      q10, d0, d4
    vst1.8        {q14}, [r2], r5       @//Store dest row2
    vaddl.u8      q13, d3, d7
    vst1.8        {q15}, [r2], r5       @//Store dest row3
    vaddl.u8      q11, d1, d5
    vld1.8        {q4}, [r0], r3        @// Load row6 ;src1
    vaddl.u8      q12, d2, d6
    vld1.8        {q5}, [r0], r3        @// Load row7 ;src1
    vqrshrun.s16  d28, q10, #1
    vld1.8        {q6}, [r1], r4        @// Load row6 ;src2
    vqrshrun.s16  d29, q11, #1
    vld1.8        {q7}, [r1], r4        @// Load row7 ;src2
    vaddl.u8      q8, d8, d12
    vaddl.u8      q9, d9, d13
    vaddl.u8      q10, d10, d14
    vqrshrun.s16  d30, q12, #1
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q14}, [r2], r5       @//Store dest row4
    vaddl.u8      q11, d11, d15
    vst1.8        {q15}, [r2], r5       @//Store dest row5
    vqrshrun.s16  d28, q8, #1
    vqrshrun.s16  d30, q10, #1
    vqrshrun.s16  d29, q9, #1
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q14}, [r2], r5       @//Store dest row6
    subs          r12, r6, #8
    vst1.8        {q15}, [r2], r5       @//Store dest row7

    beq           end_func              @ end function if ht=8

    @ Row 8 of src2 is fetched only after the ht == 8 exit above, so a
    @ 16x8 block never reads beyond its last source row.
    vld1.8        {q2}, [r1], r4        @// Load row8 ;src2
    vld1.8        {q0}, [r0], r3        @// Load row8 ;src1
    vaddl.u8      q10, d0, d4
    vld1.8        {q1}, [r0], r3        @// Load row9 ;src1
    vaddl.u8      q11, d1, d5
    vld1.8        {q3}, [r1], r4        @// Load row9 ;src2
    vqrshrun.s16  d28, q10, #1
    vld1.8        {q4}, [r0], r3        @// Load row10 ;src1
    vqrshrun.s16  d29, q11, #1
    vld1.8        {q5}, [r0], r3        @// Load row11 ;src1
    vaddl.u8      q12, d2, d6
    vld1.8        {q6}, [r1], r4        @// Load row10 ;src2
    vaddl.u8      q13, d3, d7
    vld1.8        {q7}, [r1], r4        @// Load row11 ;src2
    vaddl.u8      q8, d8, d12
    vaddl.u8      q9, d9, d13
    vaddl.u8      q10, d10, d14
    vqrshrun.s16  d30, q12, #1
    vst1.8        {q14}, [r2], r5       @//Store dest row8
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q15}, [r2], r5       @//Store dest row9
    vqrshrun.s16  d28, q8, #1
    vld1.8        {q0}, [r0], r3        @// Load row12 ;src1
    vaddl.u8      q11, d11, d15
    vld1.8        {q1}, [r0], r3        @// Load row13 ;src1
    vqrshrun.s16  d29, q9, #1
    vld1.8        {q2}, [r1], r4        @// Load row12 ;src2
    vqrshrun.s16  d30, q10, #1
    vld1.8        {q3}, [r1], r4        @// Load row13 ;src2
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q14}, [r2], r5       @//Store dest row10
    vaddl.u8      q10, d0, d4
    vst1.8        {q15}, [r2], r5       @//Store dest row11
    vaddl.u8      q11, d1, d5
    vld1.8        {q4}, [r0], r3        @// Load row14 ;src1
    vaddl.u8      q13, d3, d7
    vld1.8        {q5}, [r0], r3        @// Load row15 ;src1
    vaddl.u8      q12, d2, d6
    vld1.8        {q6}, [r1], r4        @// Load row14 ;src2
    vaddl.u8      q8, d8, d12
    vld1.8        {q7}, [r1], r4        @// Load row15 ;src2
    vaddl.u8      q9, d9, d13
    vqrshrun.s16  d28, q10, #1
    vqrshrun.s16  d29, q11, #1
    vaddl.u8      q10, d10, d14
    vst1.8        {q14}, [r2], r5       @//Store dest row12
    vqrshrun.s16  d30, q12, #1
    vqrshrun.s16  d31, q13, #1
    vaddl.u8      q11, d11, d15
    vst1.8        {q15}, [r2], r5       @//Store dest row13
    vqrshrun.s16  d28, q8, #1
    vqrshrun.s16  d29, q9, #1
    vqrshrun.s16  d30, q10, #1
    vst1.8        {q14}, [r2], r5       @//Store dest row14
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q15}, [r2], r5       @//Store dest row15

    b             end_func

    @ -------------------------------------------------------------------
    @ wd == 8 : one d register per row, same widen-add / rounding-narrow
    @ sequence as above. Exits after 4 rows (ht == 4) or 8 rows (ht == 8),
    @ otherwise processes 16 rows.
    @ -------------------------------------------------------------------
loop_8:                                 @ when wd == 8

    vld1.8        {d0}, [r0], r3        @// Load row0 ;src1
    vld1.8        {d4}, [r1], r4        @// Load row0 ;src2
    vld1.8        {d1}, [r0], r3        @// Load row1 ;src1
    vaddl.u8      q10, d0, d4
    vld1.8        {d5}, [r1], r4        @// Load row1 ;src2
    vld1.8        {d2}, [r0], r3        @// Load row2 ;src1
    vqrshrun.s16  d28, q10, #1
    vld1.8        {d6}, [r1], r4        @// Load row2 ;src2
    vaddl.u8      q11, d1, d5
    vld1.8        {d3}, [r0], r3        @// Load row3 ;src1
    vaddl.u8      q12, d2, d6
    vst1.8        {d28}, [r2], r5       @//Store dest row0
    vqrshrun.s16  d29, q11, #1
    vld1.8        {d7}, [r1], r4        @// Load row3 ;src2
    vqrshrun.s16  d30, q12, #1
    vst1.8        {d29}, [r2], r5       @//Store dest row1
    vaddl.u8      q13, d3, d7
    vst1.8        {d30}, [r2], r5       @//Store dest row2
    vqrshrun.s16  d31, q13, #1
    subs          r12, r6, #4
    vst1.8        {d31}, [r2], r5       @//Store dest row3

    beq           end_func              @ end function if ht=4

    vld1.8        {d12}, [r1], r4       @// Load row4 ;src2
    vld1.8        {d8}, [r0], r3        @// Load row4 ;src1
    vld1.8        {d9}, [r0], r3        @// Load row5 ;src1
    vaddl.u8      q8, d8, d12
    vld1.8        {d13}, [r1], r4       @// Load row5 ;src2
    vld1.8        {d10}, [r0], r3       @// Load row6 ;src1
    vaddl.u8      q9, d9, d13
    vld1.8        {d14}, [r1], r4       @// Load row6 ;src2
    vqrshrun.s16  d28, q8, #1
    vld1.8        {d11}, [r0], r3       @// Load row7 ;src1
    vqrshrun.s16  d29, q9, #1
    vst1.8        {d28}, [r2], r5       @//Store dest row4
    vaddl.u8      q10, d10, d14
    vst1.8        {d29}, [r2], r5       @//Store dest row5
    vqrshrun.s16  d30, q10, #1
    vld1.8        {d15}, [r1], r4       @// Load row7 ;src2
    vaddl.u8      q11, d11, d15
    vst1.8        {d30}, [r2], r5       @//Store dest row6
    vqrshrun.s16  d31, q11, #1
    subs          r12, r6, #8
    vst1.8        {d31}, [r2], r5       @//Store dest row7

    beq           end_func              @ end function if ht=8

    vld1.8        {d0}, [r0], r3        @// Load row8 ;src1
    vld1.8        {d4}, [r1], r4        @// Load row8 ;src2
    vld1.8        {d1}, [r0], r3        @// Load row9 ;src1
    vaddl.u8      q10, d0, d4
    vld1.8        {d5}, [r1], r4        @// Load row9 ;src2
    vld1.8        {d2}, [r0], r3        @// Load row10 ;src1
    vaddl.u8      q11, d1, d5
    vld1.8        {d6}, [r1], r4        @// Load row10 ;src2
    vqrshrun.s16  d28, q10, #1
    vld1.8        {d3}, [r0], r3        @// Load row11 ;src1
    vaddl.u8      q12, d2, d6
    vld1.8        {d7}, [r1], r4        @// Load row11 ;src2
    vqrshrun.s16  d29, q11, #1
    vld1.8        {d8}, [r0], r3        @// Load row12 ;src1
    vaddl.u8      q13, d3, d7
    vst1.8        {d28}, [r2], r5       @//Store dest row8
    vqrshrun.s16  d30, q12, #1
    vld1.8        {d12}, [r1], r4       @// Load row12 ;src2
    vqrshrun.s16  d31, q13, #1
    vst1.8        {d29}, [r2], r5       @//Store dest row9
    vaddl.u8      q8, d8, d12
    vld1.8        {d9}, [r0], r3        @// Load row13 ;src1
    vqrshrun.s16  d28, q8, #1
    vld1.8        {d13}, [r1], r4       @// Load row13 ;src2
    vld1.8        {d10}, [r0], r3       @// Load row14 ;src1
    vaddl.u8      q9, d9, d13
    vld1.8        {d11}, [r0], r3       @// Load row15 ;src1
    vld1.8        {d14}, [r1], r4       @// Load row14 ;src2
    vqrshrun.s16  d29, q9, #1
    vld1.8        {d15}, [r1], r4       @// Load row15 ;src2
    vaddl.u8      q10, d10, d14
    vst1.8        {d30}, [r2], r5       @//Store dest row10
    vaddl.u8      q11, d11, d15
    vst1.8        {d31}, [r2], r5       @//Store dest row11
    vqrshrun.s16  d30, q10, #1
    vst1.8        {d28}, [r2], r5       @//Store dest row12
    vqrshrun.s16  d31, q11, #1
    vst1.8        {d29}, [r2], r5       @//Store dest row13
    vst1.8        {d30}, [r2], r5       @//Store dest row14
    vst1.8        {d31}, [r2], r5       @//Store dest row15

    b             end_func

    @ -------------------------------------------------------------------
    @ wd == 4 : each row is loaded into / stored from lane 0 (32 bits) of
    @ a d register. The remaining lanes take part in the arithmetic but
    @ are never stored. Exits after 4 rows (ht == 4), otherwise 8 rows.
    @ -------------------------------------------------------------------
loop_4:                                 @ when wd == 4

    vld1.32       d0[0], [r0], r3       @// Load row0 ;src1
    vld1.32       d4[0], [r1], r4       @// Load row0 ;src2
    vld1.32       d1[0], [r0], r3       @// Load row1 ;src1
    vaddl.u8      q10, d0, d4
    vld1.32       d5[0], [r1], r4       @// Load row1 ;src2
    vld1.32       d2[0], [r0], r3       @// Load row2 ;src1
    vqrshrun.s16  d28, q10, #1
    vld1.32       d6[0], [r1], r4       @// Load row2 ;src2
    vaddl.u8      q11, d1, d5
    vld1.32       d3[0], [r0], r3       @// Load row3 ;src1
    vaddl.u8      q12, d2, d6
    vst1.32       d28[0], [r2], r5      @//Store dest row0
    vqrshrun.s16  d29, q11, #1
    vld1.32       d7[0], [r1], r4       @// Load row3 ;src2
    vqrshrun.s16  d30, q12, #1
    vst1.32       d29[0], [r2], r5      @//Store dest row1
    vaddl.u8      q13, d3, d7
    vst1.32       d30[0], [r2], r5      @//Store dest row2
    vqrshrun.s16  d31, q13, #1
    subs          r12, r6, #4
    vst1.32       d31[0], [r2], r5      @//Store dest row3

    beq           end_func              @ end function if ht=4

    vld1.32       d12[0], [r1], r4      @// Load row4 ;src2
    vld1.32       d8[0], [r0], r3       @// Load row4 ;src1
    vld1.32       d9[0], [r0], r3       @// Load row5 ;src1
    vaddl.u8      q8, d8, d12
    vld1.32       d13[0], [r1], r4      @// Load row5 ;src2
    vld1.32       d10[0], [r0], r3      @// Load row6 ;src1
    vaddl.u8      q9, d9, d13
    vld1.32       d14[0], [r1], r4      @// Load row6 ;src2
    vqrshrun.s16  d28, q8, #1
    vld1.32       d11[0], [r0], r3      @// Load row7 ;src1
    vqrshrun.s16  d29, q9, #1
    vst1.32       d28[0], [r2], r5      @//Store dest row4
    vaddl.u8      q10, d10, d14
    vst1.32       d29[0], [r2], r5      @//Store dest row5
    vqrshrun.s16  d30, q10, #1
    vld1.32       d15[0], [r1], r4      @// Load row7 ;src2
    vaddl.u8      q11, d11, d15
    vst1.32       d30[0], [r2], r5      @//Store dest row6
    vqrshrun.s16  d31, q11, #1
    vst1.32       d31[0], [r2], r5      @//Store dest row7

end_func:

    vldmia        sp!, {d8-d15}         @ restore the NEON registers saved on entry
    ldmfd         sp!, {r4-r12, pc}     @ restore core registers and return