@/****************************************************************************** @ * @ * Copyright (C) 2015 The Android Open Source Project @ * @ * Licensed under the Apache License, Version 2.0 (the "License"); @ * you may not use this file except in compliance with the License. @ * You may obtain a copy of the License at: @ * @ * http://www.apache.org/licenses/LICENSE-2.0 @ * @ * Unless required by applicable law or agreed to in writing, software @ * distributed under the License is distributed on an "AS IS" BASIS, @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @ * See the License for the specific language governing permissions and @ * limitations under the License. @ * @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ @** @** @****************************************************************************** @* @* @* @brief @* This file contains definitions of routines that compute distortion @* between two macro/sub blocks of identical dimensions @* @* @author @* Ittiam @* @* @par List of Functions: @* - ime_compute_sad_16x16_a9q() @* - ime_compute_sad_16x16_fast_a9q() @* - ime_compute_sad_16x8_a9q() @* - ime_compute_sad_16x16_ea8_a9q() @* - ime_calculate_sad2_prog_a9q() @* - ime_calculate_sad3_prog_a9q() @* - ime_calculate_sad4_prog_a9q() @* - ime_sub_pel_compute_sad_16x16_a9q() @* - ime_compute_satqd_16x16_lumainter_a9q() @* - @* @remarks @* None @* @******************************************************************************* @ @** @****************************************************************************** @* @* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode) @* @* @par Description @* This functions computes SAD between 2 16x16 blocks. There is a provision @* for early exit if the up-to computed SAD exceeds maximum allowed SAD. 
To
@* compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] i4_max_sad
@*  integer maximum allowed distortion (not consulted by this variant)
@*
@* @param[in] pi4_mb_distortion
@*  pointer to which the evaluated SAD is written
@*
@* @remarks
@*  "fast" variant: only every second row is compared (both strides are
@*  doubled) and the resulting 8-row SAD is doubled to approximate the
@*  full 16-row SAD.
@*
@******************************************************************************
@*

.text
.p2align 2

    .global ime_compute_sad_16x16_fast_a9q

ime_compute_sad_16x16_fast_a9q:

    stmfd sp!, {r12, lr}                @ save scratch + return address
    vpush {d8-d15}                      @ d8-d15 are callee-saved (AAPCS VFP)
    lsl   r2, r2, #1                    @ src stride *= 2 -> sample alternate rows
    lsl   r3, r3, #1                    @ dst stride *= 2

    @ for bringing buffer2 into cache..., dummy load instructions
    @ LDR r12,[r1]

    vld1.8 {d4, d5}, [r0], r2           @ src row 0 (16 px)
    vld1.8 {d6, d7}, [r1], r3           @ dst row 0
    mov r12, #6                         @ 6 more sampled rows handled by the loop
    vld1.8 {d8, d9}, [r0], r2           @ src row 2 (software-pipelined ahead)
    vabdl.u8 q0, d6, d4                 @ q0 = |dst - src| row 0, px 0-7 (u16)
    vabdl.u8 q1, d7, d5                 @ q1 = |dst - src| row 0, px 8-15
    vld1.8 {d10, d11}, [r1], r3         @ dst row 2

loop_sad_16x16_fast:
    vld1.8 {d4, d5}, [r0], r2           @ prefetch next src row
    vabal.u8 q0, d10, d8                @ accumulate row loaded last pass, px 0-7
    vabal.u8 q1, d11, d9                @ px 8-15
    vld1.8 {d6, d7}, [r1], r3           @ prefetch next dst row
    subs r12, #2                        @ two sampled rows retired per pass
    vld1.8 {d8, d9}, [r0], r2
    vabal.u8 q0, d6, d4
    vabal.u8 q1, d7, d5
    vld1.8 {d10, d11}, [r1], r3
    bne loop_sad_16x16_fast

    vabal.u8 q0, d10, d8                @ fold in the final pipelined row
    vabal.u8 q1, d11, d9
    vadd.i16 q0, q0, q1                 @ merge the two 8-px accumulators
    vadd.i16 d0, d1, d0
    vpop {d8-d15}
    ldr r12, [sp, #12]                  @ pi4_mb_distortion: 2nd stack arg, d-regs
                                        @ already popped so only {r12,lr} remain
    vpaddl.u16 d0, d0                   @ horizontal reduce u16 -> u32
    vpaddl.u32 d0, d0                   @ -> single scalar in d0[0]
    vshl.u32 d0, d0, #1                 @ *2: compensate for sampling alternate rows
    vst1.32 {d0[0]}, [r12]              @ *pi4_mb_distortion = sad
    ldmfd sp!, {r12, pc}                @ restore and return

@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x8 blocks
@*
@* @par Description
@* This function computes SAD between 2 16x8 blocks. To compute the
@* distortion of the entire block set u4_max_sad to USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] u4_max_sad
@*  integer maximum allowed distortion (not consulted by this variant)
@*
@* @param[in] pi4_mb_distortion
@*  pointer to which the evaluated SAD is written
@*
@* @remarks
@*  Same pipelined structure as the 16x16 "fast" kernel, but strides are
@*  NOT doubled and no final doubling is applied: all 8 rows are compared.
@*
@******************************************************************************
@*
@
    .global ime_compute_sad_16x8_a9q

ime_compute_sad_16x8_a9q:

    stmfd sp!, {r12, lr}                @ save scratch + return address

    @ for bringing buffer2 into cache..., dummy load instructions
    @ LDR r12,[r1]

    vld1.8 {d4, d5}, [r0], r2           @ src row 0 (16 px)
    vld1.8 {d6, d7}, [r1], r3           @ dst row 0
    mov r12, #6                         @ 6 more rows handled by the loop
    vpush {d8-d15}                      @ d8-d15 are callee-saved (AAPCS VFP)
    vld1.8 {d8, d9}, [r0], r2           @ src row 1 (software-pipelined ahead)
    vabdl.u8 q0, d6, d4                 @ q0 = |dst - src| row 0, px 0-7 (u16)
    vabdl.u8 q1, d7, d5                 @ q1 = |dst - src| row 0, px 8-15
    vld1.8 {d10, d11}, [r1], r3         @ dst row 1

loop_sad_16x8:
    vld1.8 {d4, d5}, [r0], r2           @ prefetch next src row
    vabal.u8 q0, d10, d8                @ accumulate row loaded last pass, px 0-7
    vabal.u8 q1, d11, d9                @ px 8-15
    vld1.8 {d6, d7}, [r1], r3           @ prefetch next dst row
    subs r12, #2                        @ two rows retired per pass
    vld1.8 {d8, d9}, [r0], r2
    vabal.u8 q0, d6, d4
    vabal.u8 q1, d7, d5
    vld1.8 {d10, d11}, [r1], r3
    bne loop_sad_16x8

    vabal.u8 q0, d10, d8                @ fold in the final pipelined row
    vabal.u8 q1, d11, d9
    vadd.i16 q0, q0, q1                 @ merge the two 8-px accumulators
    vadd.i16 d0, d1, d0
    vpop {d8-d15}
    ldr r12, [sp, #12]                  @ pi4_mb_distortion: 2nd stack arg, d-regs
                                        @ already popped so only {r12,lr} remain
    vpaddl.u16 d0, d0                   @ horizontal reduce u16 -> u32
    vpaddl.u32 d0, d0                   @ -> single scalar in d0[0]
    vst1.32 {d0[0]}, [r12]              @ *pi4_mb_distortion = sad
    ldmfd sp!, {r12, pc}                @ restore and return

@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
@*
@* @par Description
@* This function computes SAD between 2 16x16 blocks. There is a provision
@* for early exit if the partially computed SAD exceeds the maximum allowed
@* SAD. To compute the distortion of the entire block set u4_max_sad to
@* USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] i4_max_sad
@*  integer maximum allowed distortion
@*
@* @param[in] pi4_mb_distortion
@*  pointer to which the evaluated SAD is written
@*
@* @remarks
@*  "ea8" = early exit after 8 rows: the even rows (0,2,..,14) are summed
@*  first; if their SAD already exceeds i4_max_sad, the odd rows are skipped
@*  and the partial SAD is left in *pi4_mb_distortion.
@*
@******************************************************************************
@*
    .global ime_compute_sad_16x16_ea8_a9q

ime_compute_sad_16x16_ea8_a9q:
    stmfd sp!, {r5-r7, lr}              @ 16 bytes pushed
    lsl r2, r2, #1                      @ double strides: pass 1 = even rows only
    lsl r3, r3, #1

    @ for bringing buffer2 into cache..., dummy load instructions
    @ LDR r12,[r1]

    vld1.8 {d4, d5}, [r0], r2           @ src row 0
    vld1.8 {d6, d7}, [r1], r3           @ dst row 0
    mov r5, #6                          @ 6 more even rows handled by loop 1
    ldrd r6, r7, [sp, #16]              @ r6 = i4_max_sad, r7 = pi4_mb_distortion
                                        @ (stack args; loaded BEFORE vpush)
    vpush {d8-d15}                      @ callee-saved NEON registers
    vld1.8 {d8, d9}, [r0], r2           @ src row 2 (pipelined ahead)
    vabdl.u8 q0, d6, d4                 @ |dst - src| row 0, px 0-7 (u16)
    vabdl.u8 q1, d7, d5                 @ px 8-15
    vld1.8 {d10, d11}, [r1], r3         @ dst row 2

loop_sad_16x16_ea8_1:
    vld1.8 {d4, d5}, [r0], r2
    vabal.u8 q0, d10, d8                @ accumulate row loaded last pass
    vabal.u8 q1, d11, d9
    vld1.8 {d6, d7}, [r1], r3
    subs r5, #2                         @ two even rows retired per pass
    vld1.8 {d8, d9}, [r0], r2
    vabal.u8 q0, d6, d4
    vabal.u8 q1, d7, d5
    vld1.8 {d10, d11}, [r1], r3
    bne loop_sad_16x16_ea8_1

    vabal.u8 q0, d10, d8                @ final even row
    sub r0, r0, r2, lsl #3              @ rewind src by 8 doubled strides
    vabal.u8 q1, d11, d9
    sub r1, r1, r3, lsl #3              @ rewind dst likewise
    vadd.i16 q6, q0, q1                 @ partial (even-row) SAD into q6
    add r0, r0, r2, asr #1              @ advance by one ORIGINAL stride -> row 1
    vadd.i16 d12, d12, d13
    add r1, r1, r3, asr #1
    vpaddl.u16 d12, d12
    vld1.8 {d4, d5}, [r0], r2           @ src row 1: odd pass already pipelined
    vld1.8 {d6, d7}, [r1], r3
    vpaddl.u32 d12, d12                 @ even-row SAD scalar in d12[0]
    vld1.8 {d8, d9}, [r0], r2
    vabal.u8 q0, d6, d4                 @ keep accumulating odd rows into q0/q1
    vabal.u8 q1, d7, d5
    vst1.32 {d12[0]}, [r7]              @ publish partial SAD
    ldr r5, [r7]
    cmp r5, r6                          @ early exit if partial SAD > i4_max_sad
    bgt end_func_16x16_ea8
    vld1.8 {d10, d11}, [r1], r3
    mov r5, #6                          @ 6 more odd rows handled by loop 2

loop_sad_16x16_ea8_2:
    vld1.8 {d4, d5}, [r0], r2
    vabal.u8 q0, d10, d8
    vabal.u8 q1, d11, d9
    vld1.8 {d6, d7}, [r1], r3
    subs r5, #2
    vld1.8 {d8, d9}, [r0], r2
    vabal.u8 q0, d6, d4
    vabal.u8 q1, d7, d5
    vld1.8 {d10, d11}, [r1], r3
    bne loop_sad_16x16_ea8_2

    vabal.u8 q0, d10, d8                @ final odd row
    vabal.u8 q1, d11, d9
    vadd.i16 q0, q0, q1
    vadd.i16 d0, d1, d0
    vpaddl.u16 d0, d0
    vpaddl.u32 d0, d0
    vst1.32 {d0[0]}, [r7]               @ full 16x16 SAD

end_func_16x16_ea8:
    vpop {d8-d15}
    ldmfd sp!, {r5-r7, pc}

@*
@//---------------------------------------------------------------------------
@// Function Name      : Calculate_Mad2_prog()
@//
@// Detail Description : Computes the 16x16 SADs of one source block against
@//                      two reference blocks in a single pass.
@//
@// Platform           : CortexA8/NEON.
@//
@//-----------------------------------------------------------------------------
@*
    .global ime_calculate_sad2_prog_a9q

ime_calculate_sad2_prog_a9q:
    @ r0 = ref1 <UWORD8 *>
    @ r1 = ref2 <UWORD8 *>
    @ r2 = src <UWORD8 *>
    @ r3 = RefBufferWidth <UWORD32>
    @ stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>

    stmfd sp!, {r4-r5, lr}              @ 12 bytes pushed
    ldr r4, [sp, #12]                   @ r4 = CurBufferWidth (src stride):
                                        @ 1st stack arg sits at sp+12 after the
                                        @ 12-byte stmfd. (BUGFIX: was [sp, #8],
                                        @ which reads the saved lr slot.)
    mov r5, #14                         @ 14 rows handled by the loop (+2 preamble)
    vpush {d8-d15}

    @Row 1
    vld1.8 {d0, d1}, [r2], r4           @ load src Row 1
    vld1.8 {d2, d3}, [r0], r3           @ load ref1 Row 1
    vld1.8 {d4, d5}, [r1], r3           @ load ref2 Row 1

    @Row 2
    vld1.8 {d6, d7}, [r2], r4           @ load src Row 2
    vabdl.u8 q6, d2, d0                 @ ref1 accumulator, px 0-7
    vabdl.u8 q7, d3, d1                 @ ref1 accumulator, px 8-15
    vld1.8 {d8, d9}, [r0], r3           @ load ref1 Row 2
    vabdl.u8 q8, d4, d0                 @ ref2 accumulator, px 0-7
    vabdl.u8 q9, d5, d1                 @ ref2 accumulator, px 8-15
    vld1.8 {d10, d11}, [r1], r3         @ load ref2 Row 2

loop_sad2_prog:
    subs r5, #2                         @ two rows retired per pass

    @Row 1
    vld1.8 {d0, d1}, [r2], r4           @ load src Row 1
    vabal.u8 q6, d8, d6                 @ ref1 vs src, row loaded last pass
    vabal.u8 q7, d9, d7
    vld1.8 {d2, d3}, [r0], r3           @ load ref1 Row 1
    vabal.u8 q8, d10, d6                @ ref2 vs src, row loaded last pass
    vabal.u8 q9, d11, d7
    vld1.8 {d4, d5}, [r1], r3           @ load ref2 Row 1

    @Row 2
    vld1.8 {d6, d7}, [r2], r4           @ load src Row 2
    vabal.u8 q6, d2, d0
    vabal.u8 q7, d3, d1
    vld1.8 {d8, d9}, [r0], r3           @ load ref1 Row 2
    vabal.u8 q8, d4, d0
    vabal.u8 q9, d5, d1
    vld1.8 {d10, d11}, [r1], r3         @ load ref2 Row 2
    bne loop_sad2_prog

    vabal.u8 q6, d8, d6                 @ fold in the final pipelined row
    vabal.u8 q7, d9, d7
    vabal.u8 q8, d10, d6
    vabal.u8 q9, d11, d7

    @ Compute SAD
    vadd.u16 q6, q6, q7                 @ Q6 : sad_ref1
    vadd.u16 q8, q8, q9                 @ Q8 : sad_ref2
    vadd.u16 d12, d12, d13
    ldr r5, [sp, #80]                   @ r5 = psad: 2nd stack arg; d8-d15 are
                                        @ still pushed here, so 12 + 64 + 4 = 80.
                                        @ (BUGFIX: was [sp, #16], which points
                                        @ into the saved d-registers and would
                                        @ corrupt them instead of loading psad.)
    vadd.u16 d16, d16, d17
    vpadd.u16 d12, d12, d16             @ pairwise: {ref1, ref2} halves
    vpaddl.u16 d12, d12                 @ -> two u32 SADs
    vst1.64 {d12}, [r5]!                @ psad[0] = ref1, psad[1] = ref2
    vpop {d8-d15}
    ldmfd sp!, {r4-r5, pc}

@*
@//---------------------------------------------------------------------------
@// Function Name      : Calculate_Mad3_prog()
@//
@// Detail Description : Computes the 16x16 SADs of one source block against
@//                      three reference blocks in a single pass.
@//
@// Platform           : CortexA8/NEON.
@//
@//-----------------------------------------------------------------------------
@*
    .global ime_calculate_sad3_prog_a9q

ime_calculate_sad3_prog_a9q:
    @ r0 = ref1 <UWORD8 *>
    @ r1 = ref2 <UWORD8 *>
    @ r2 = ref3 <UWORD8 *>
    @ r3 = src <UWORD8 *>
    @ stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *>

    stmfd sp!, {r4-r6, lr}              @ 16 bytes pushed
    ldrd r4, r5, [sp, #16]              @ r4 = ref stride, r5 = src stride
                                        @ (1st/2nd stack args; loaded before vpush)
    mov r6, #14                         @ 14 rows handled by the loop (+2 preamble)
    vpush {d8-d15}

    @Row 1
    vld1.8 {d0, d1}, [r3], r5           @ load src Row 1
    vld1.8 {d2, d3}, [r0], r4           @ load ref1 Row 1
    vld1.8 {d4, d5}, [r1], r4           @ load ref2 Row 1
    vabdl.u8 q8, d2, d0                 @ ref1 accumulators (q8/q9)
    vabdl.u8 q9, d3, d1
    vld1.8 {d6, d7}, [r2], r4           @ load ref3 Row 1
    vabdl.u8 q10, d4, d0                @ ref2 accumulators (q10/q11)
    vabdl.u8 q11, d5, d1

    @Row 2
    vld1.8 {d8, d9}, [r3], r5           @ load src Row 2
    vabdl.u8 q12, d6, d0                @ ref3 accumulators (q12/q13)
    vabdl.u8 q13, d7, d1
    vld1.8 {d10, d11}, [r0], r4         @ load ref1 Row 2
    vld1.8 {d12, d13}, [r1], r4         @ load ref2 Row 2
    vabal.u8 q8, d10, d8
    vabal.u8 q9, d11, d9
    vld1.8 {d14, d15}, [r2], r4         @ load ref3 Row 2
    vabal.u8 q10, d12, d8
    vabal.u8 q11, d13, d9

loop_sad3_prog:
    @Row 1
    vld1.8 {d0, d1}, [r3], r5           @ load src Row 1
    vabal.u8 q12, d14, d8               @ ref3 vs src row loaded last pass
    vabal.u8 q13, d15, d9
    vld1.8 {d2, d3}, [r0], r4           @ load ref1 Row 1
    vld1.8 {d4, d5}, [r1], r4           @ load ref2 Row 1
    vabal.u8 q8, d2, d0
    vabal.u8 q9, d3, d1
    vld1.8 {d6, d7}, [r2], r4           @ load ref3 Row 1
    vabal.u8 q10, d4, d0
    vabal.u8 q11, d5, d1

    @Row 2
    vld1.8 {d8, d9}, [r3], r5           @ load src Row 2
    vabal.u8 q12, d6, d0
    vabal.u8 q13, d7, d1
    vld1.8 {d10, d11}, [r0], r4         @ load ref1 Row 2
    subs r6, #2                         @ two rows retired per pass
    vld1.8 {d12, d13}, [r1], r4         @ load ref2 Row 2
    vabal.u8 q8, d10, d8
    vabal.u8 q9, d11, d9
    vld1.8 {d14, d15}, [r2], r4         @ load ref3 Row 2
    vabal.u8 q10, d12, d8
    vabal.u8 q11, d13, d9
    bne loop_sad3_prog

    vabal.u8 q12, d14, d8               @ fold in the final pipelined row
    vabal.u8 q13, d15, d9

    @ Compute SAD
    vadd.u16 q8, q8, q9                 @ Q8  : sad_ref1
    vadd.u16 q10, q10, q11              @ Q10 : sad_ref2
    vadd.u16 q12, q12, q13              @ Q12 : sad_ref3
    vadd.u16 d16, d16, d17
    vadd.u16 d20, d20, d21
    vadd.u16 d24, d24, d25
    vpadd.u16 d16, d16, d20             @ {ref1, ref2} pairwise halves
    vpadd.u16 d24, d24, d24             @ ref3 pairwise halves
    ldr r6, [sp, #88]                   @ r6 = psad: 3rd stack arg; d8-d15 still
                                        @ pushed, so 16 + 64 + 8 = 88.
                                        @ (BUGFIX: was [sp, #24], which points
                                        @ into the saved d-registers.)
    vpaddl.u16 d16, d16                 @ -> two u32 SADs (ref1, ref2)
    vpaddl.u16 d24, d24                 @ -> u32 SAD (ref3)
    vst1.64 {d16}, [r6]!                @ psad[0] = ref1, psad[1] = ref2
    vst1.32 {d24[0]}, [r6]              @ psad[2] = ref3
    vpop {d8-d15}
    ldmfd sp!, {r4-r6, pc}

@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) for sub-pel motion estimation
@*
@* @par Description
@* This function computes SAD of the source against all 8 half-pel points
@* around the full-pel candidate, in one pass over the source. The eight
@* half-pel reference streams are offset copies of the x / y / xy planes:
@* in steady state, row i of a "top" stream equals row i-1 of its base
@* stream, which is why the loop reuses the previous iteration's loads.
@*
@* @param[out] pi4_sad
@*  integer evaluated sad
@*  pi4_sad[0] - half x
@*  pi4_sad[1] - half x - 1
@*  pi4_sad[2] - half y
@*  pi4_sad[3] - half y - 1
@*  pi4_sad[4] - half xy
@*  pi4_sad[5] - half xy - 1
@*  pi4_sad[6] - half xy - strd
@*  pi4_sad[7] - half xy - 1 - strd
@*
@* @remarks
@*  Register args: r0 = src, r1 = half x, r2 = half y, r3 = half xy;
@*  stack: src_strd, ref_strd, pi4_sad.
@*
@******************************************************************************
@*
.text
.p2align 2

    .global ime_sub_pel_compute_sad_16x16_a9q

ime_sub_pel_compute_sad_16x16_a9q:
    stmfd sp!, {r4-r11, lr}             @ 36 bytes pushed
    ldr r9, [sp, #36]                   @ r9  = src_strd (1st stack arg)
    ldr r10, [sp, #40]                  @ r10 = ref_strd (2nd stack arg)
    vpush {d8-d15}

    sub r4, r1, #1                      @ r4 = half x - 1
    sub r5, r2, r10                     @ r5 = half y - strd ("y top")
    sub r6, r3, #1                      @ r6 = half xy - 1
    sub r7, r3, r10                     @ r7 = half xy - strd ("xy top")
    sub r8, r7, #1                      @ r8 = half xy - 1 - strd ("xy top-left")
    mov r11, #15                        @ 15 rows handled by the loop (+1 preamble)

    @ for bringing buffer2 into cache..., dummy load instructions
    @ LDR r12,[r1]
    @ LDR r12,[sp,#12]

    vld1.8 {d0, d1}, [r0], r9           @ src row 0
    vld1.8 {d2, d3}, [r5], r10          @ y top LOAD
    vld1.8 {d4, d5}, [r7], r10          @ xy top LOAD
    vld1.8 {d6, d7}, [r8], r10          @ xy top-left LOAD
    vabdl.u8 q6, d2, d0                 @ y top ABS1 (px 0-7)
    vabdl.u8 q7, d4, d0                 @ xy top ABS1
    vld1.8 {d8, d9}, [r1], r10          @ x LOAD
    vabdl.u8 q8, d6, d0                 @ xy top-left ABS1
    vabdl.u8 q9, d8, d0                 @ x ABS1
    vld1.8 {d10, d11}, [r4], r10        @ x left LOAD
    vabal.u8 q6, d3, d1                 @ y top ABS2 (px 8-15)
    vabal.u8 q7, d5, d1                 @ xy top ABS2
    vld1.8 {d2, d3}, [r2], r10          @ y LOAD (becomes "y top" next row)
    vabal.u8 q8, d7, d1                 @ xy top-left ABS2
    vabal.u8 q9, d9, d1                 @ x ABS2
    vld1.8 {d4, d5}, [r3], r10          @ xy LOAD (becomes "xy top" next row)
    vabdl.u8 q10, d10, d0               @ x left ABS1
    vabdl.u8 q11, d2, d0                @ y ABS1
    vld1.8 {d6, d7}, [r6], r10          @ xy left LOAD
    vabdl.u8 q12, d4, d0                @ xy ABS1
    vabdl.u8 q13, d6, d0                @ xy left ABS1

loop_sub_pel_16x16:
    vabal.u8 q10, d11, d1               @ x left ABS2
    vabal.u8 q11, d3, d1                @ y ABS2
    subs r11, #1                        @ one row retired per pass
    vabal.u8 q12, d5, d1                @ xy ABS2
    vabal.u8 q13, d7, d1                @ xy left ABS2
    vld1.8 {d0, d1}, [r0], r9           @ src next row
    vabal.u8 q6, d2, d0                 @ y top ABS1 (d2 = y row from last pass)
    vabal.u8 q7, d4, d0                 @ xy top ABS1 (d4 = xy row from last pass)
    vld1.8 {d8, d9}, [r1], r10          @ x LOAD
    vabal.u8 q8, d6, d0                 @ xy top-left ABS1
    vabal.u8 q9, d8, d0                 @ x ABS1
    vld1.8 {d10, d11}, [r4], r10        @ x left LOAD
    vabal.u8 q6, d3, d1                 @ y top ABS2
    vabal.u8 q7, d5, d1                 @ xy top ABS2
    vld1.8 {d2, d3}, [r2], r10          @ y LOAD
    vabal.u8 q8, d7, d1                 @ xy top-left ABS2
    vabal.u8 q9, d9, d1                 @ x ABS2
    vld1.8 {d4, d5}, [r3], r10          @ xy LOAD
    vabal.u8 q10, d10, d0               @ x left ABS1
    vabal.u8 q11, d2, d0                @ y ABS1
    vld1.8 {d6, d7}, [r6], r10          @ xy left LOAD
    vabal.u8 q12, d4, d0                @ xy ABS1
    vabal.u8 q13, d6, d0                @ xy left ABS1
    bne loop_sub_pel_16x16

    vabal.u8 q10, d11, d1               @ x left ABS2 (drain pipeline)
    vabal.u8 q11, d3, d1                @ y ABS2
    vabal.u8 q12, d5, d1                @ xy ABS2
    vabal.u8 q13, d7, d1                @ xy left ABS2

    vadd.i16 d0, d18, d19               @ x
    vadd.i16 d3, d12, d13               @ y top
    vadd.i16 d6, d14, d15               @ xy top
    vadd.i16 d5, d26, d27               @ xy left
    vadd.i16 d1, d20, d21               @ x left
    vadd.i16 d2, d22, d23               @ y
    vadd.i16 d4, d24, d25               @ xy
    vadd.i16 d7, d16, d17               @ xy top left
    vpadd.i16 d0, d0, d1                @ pair results in output order
    vpadd.i16 d2, d2, d3
    vpadd.i16 d4, d4, d5
    vpadd.i16 d6, d6, d7
    vpaddl.u16 d0, d0                   @ -> u32 SAD pairs
    vpaddl.u16 d2, d2
    vpop {d8-d15}
    ldr r11, [sp, #44]                  @ pi4_sad (3rd stack arg; d-regs popped)
    vpaddl.u16 d4, d4
    vpaddl.u16 d6, d6
    vst1.32 {d0}, [r11]!                @ pi4_sad[0..1] = x, x-1
    vst1.32 {d2}, [r11]!                @ pi4_sad[2..3] = y, y-1
    vst1.32 {d4}, [r11]!                @ pi4_sad[4..5] = xy, xy-1
    vst1.32 {d6}, [r11]!                @ pi4_sad[6..7] = xy-strd, xy-1-strd
    ldmfd sp!, {r4-r11, pc}             @ restore registers and return

@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks
@*
@* @par Description
@* This function computes SAD between 2 16x16 blocks.
There is a provision
@* for early exit if the partially computed SAD exceeds the maximum allowed
@* SAD. To compute the distortion of the entire block set u4_max_sad to
@* USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] i4_max_sad
@*  integer maximum allowed distortion (not consulted by this variant)
@*
@* @param[in] pi4_mb_distortion
@*  pointer to which the evaluated SAD is written
@*
@* @remarks
@*  Full 16-row SAD, two rows per loop pass, software-pipelined loads.
@*
@******************************************************************************
@*

.text
.p2align 2

    .global ime_compute_sad_16x16_a9q

ime_compute_sad_16x16_a9q:

    @STMFD sp!,{r12,lr}
    stmfd sp!, {r12, r14}               @ save scratch + return address

    @ for bringing buffer2 into cache..., dummy load instructions
    @ LDR r12,[r1]
    @ LDR r12,[sp,#12]

    vld1.8 {d4, d5}, [r0], r2           @ src row 0 (16 px)
    vld1.8 {d6, d7}, [r1], r3           @ dst row 0
    vpush {d8-d15}                      @ d8-d15 are callee-saved (AAPCS VFP)
    mov r12, #14                        @ 14 more rows handled by the loop
    vld1.8 {d8, d9}, [r0], r2           @ src row 1 (software-pipelined ahead)
    vabdl.u8 q0, d4, d6                 @ q0 = |src - dst| row 0, px 0-7 (u16)
    vld1.8 {d10, d11}, [r1], r3         @ dst row 1
    vabdl.u8 q1, d5, d7                 @ q1 = |src - dst| row 0, px 8-15

loop_sad_16x16:
    vld1.8 {d4, d5}, [r0], r2           @ prefetch next src row
    vabal.u8 q0, d8, d10                @ accumulate row loaded last pass, px 0-7
    vld1.8 {d6, d7}, [r1], r3           @ prefetch next dst row
    vabal.u8 q1, d9, d11                @ px 8-15
    vld1.8 {d8, d9}, [r0], r2
    vabal.u8 q0, d4, d6
    subs r12, #2                        @ two rows retired per pass
    vld1.8 {d10, d11}, [r1], r3
    vabal.u8 q1, d5, d7
    bne loop_sad_16x16

    vabal.u8 q0, d8, d10                @ fold in the final pipelined row
    vabal.u8 q1, d9, d11
    vadd.i16 q0, q0, q1                 @ merge the two 8-px accumulators
    vadd.i16 d0, d1, d0
    vpop {d8-d15}
    ldr r12, [sp, #12]                  @ pi4_mb_distortion: 2nd stack arg, d-regs
                                        @ already popped so only {r12,lr} remain
    vpaddl.u16 d0, d0                   @ horizontal reduce u16 -> u32
    vpaddl.u32 d0, d0                   @ -> single scalar in d0[0]
    vst1.32 {d0[0]}, [r12]              @ *pi4_mb_distortion = sad
    ldmfd sp!, {r12, pc}                @ restore registers and return

@*
@//---------------------------------------------------------------------------
@// Function Name      : Calculate_Mad4_prog()
@//
@// Detail Description : Computes the 16x16 SADs of the source against the
@//                      four one-pel neighbours (top/left/right/bottom) of a
@//                      reference position in one pass.
@//
@// Platform           : CortexA8/NEON.
@//
@//-----------------------------------------------------------------------------
@*
    .global ime_calculate_sad4_prog_a9q

ime_calculate_sad4_prog_a9q:
    @ r0 = temp_frame <UWORD8 *>
    @ r1 = buffer_ptr <UWORD8 *>
    @ r2 = RefBufferWidth <UWORD32>
    @ r3 = CurBufferWidth <UWORD32>
    @ stack = psad <UWORD32 *> {at 0x34}

    stmfd sp!, {r4-r7, lr}              @ 20 bytes pushed

    @UWORD8 *left_ptr  = temp_frame - 1;
    @UWORD8 *right_ptr = temp_frame + 1;
    @UWORD8 *top_ptr   = temp_frame - RefBufferWidth;
    @UWORD8 *bot_ptr   = temp_frame + RefBufferWidth;

    mov r7, #14                         @ 14 rows handled by the loop (+2 preamble)
    sub r4, r0, #0x01                   @ r4 = left_ptr
    add r5, r0, #0x1                    @ r5 = right_ptr
    sub r6, r0, r2                      @ r6 = top_ptr
    add r0, r0, r2                      @ r0 = bot_ptr
    @ r1 = buffer_ptr (source)
    vpush {d8-d15}

    @D0:D1 : buffer (src)
    @D2:D3 : top
    @D4:D5 : left
    @D6:D7 : right
    @D8:D9 : bottom

    @Row 1
    vld1.8 {d0, d1}, [r1], r3           @ load src Row 1
    vld1.8 {d2, d3}, [r6], r2           @ load top Row 1
    vld1.8 {d4, d5}, [r4], r2           @ load left Row 1
    vabdl.u8 q5, d2, d0                 @ top accumulators (q5/q6)
    vld1.8 {d6, d7}, [r5], r2           @ load right Row 1
    vabdl.u8 q6, d3, d1
    vabdl.u8 q7, d0, d4                 @ left accumulators (q7/q8)
    vld1.8 {d8, d9}, [r0], r2           @ load bottom Row 1
    vabdl.u8 q8, d1, d5

    @Row 2
    vabdl.u8 q9, d0, d6                 @ right accumulators (q9/q10)
    vld1.8 {d26, d27}, [r1], r3         @ load src Row 2
    vabdl.u8 q10, d1, d7
    vabdl.u8 q11, d0, d8                @ bottom accumulators (q11/q12)
    vld1.8 {d2, d3}, [r6], r2           @ load top Row 2
    vabdl.u8 q12, d1, d9

loop_sad4_prog:
    @ odd row: src in d26/d27
    vabal.u8 q5, d26, d2
    vld1.8 {d4, d5}, [r4], r2           @ load left Row 2
    vabal.u8 q6, d27, d3
    vabal.u8 q7, d26, d4
    vld1.8 {d6, d7}, [r5], r2           @ load right Row 2
    vabal.u8 q8, d27, d5
    vabal.u8 q9, d26, d6
    vld1.8 {d8, d9}, [r0], r2           @ load bottom Row 2
    vabal.u8 q10, d27, d7

    @Row 1
    vabal.u8 q11, d26, d8
    vld1.8 {d0, d1}, [r1], r3           @ load src Row 1
    vabal.u8 q12, d27, d9
    vld1.8 {d2, d3}, [r6], r2           @ load top Row 1
    subs r7, #2                         @ two rows retired per pass
    vld1.8 {d4, d5}, [r4], r2           @ load left Row 1
    vabal.u8 q5, d0, d2
    vld1.8 {d6, d7}, [r5], r2           @ load right Row 1
    vabal.u8 q6, d1, d3
    vabal.u8 q7, d0, d4
    vld1.8 {d8, d9}, [r0], r2           @ load bottom Row 1
    vabal.u8 q8, d1, d5

    @Row 2
    vabal.u8 q9, d0, d6
    vld1.8 {d26, d27}, [r1], r3         @ load src Row 2
    vabal.u8 q10, d1, d7
    vabal.u8 q11, d0, d8
    vld1.8 {d2, d3}, [r6], r2           @ load top Row 2
    vabal.u8 q12, d1, d9
    bne loop_sad4_prog

    @ drain the last pipelined row
    vabal.u8 q5, d26, d2
    vld1.8 {d4, d5}, [r4], r2           @ load left Row 2
    vabal.u8 q6, d27, d3
    vabal.u8 q7, d26, d4
    vld1.8 {d6, d7}, [r5], r2           @ load right Row 2
    vabal.u8 q8, d27, d5
    vabal.u8 q9, d26, d6
    vld1.8 {d8, d9}, [r0], r2           @ load bottom Row 2
    vabal.u8 q10, d27, d7
    vabal.u8 q11, d26, d8
    vabal.u8 q12, d27, d9

    @;Q5:Q6   : sad_top
    @;Q7:Q8   : sad_left
    @;Q9:Q10  : sad_right
    @;Q11:Q12 : sad_bot
    vadd.u16 q5, q5, q6
    vadd.u16 q7, q7, q8
    vadd.u16 q9, q9, q10
    vadd.u16 q11, q11, q12

    @; Free :- Q6,Q8,Q10,Q12
    @;Q5  -> D10:D11
    @;Q7  -> D14:D15
    @;Q9  -> D18:D19
    @;Q11 -> D22:D23
    vadd.u16 d10, d10, d11
    vadd.u16 d14, d14, d15
    vadd.u16 d18, d18, d19
    vadd.u16 d22, d22, d23

    @;D10 : sad_top, D14 : sad_left, D18 : sad_right, D22 : sad_bot
    vpaddl.u16 d11, d10
    vpaddl.u16 d15, d14
    vpaddl.u16 d19, d18
    vpaddl.u16 d23, d22

    @;D11 : sad_top, D15 : sad_left, D19 : sad_right, D23 : sad_bot
    vpaddl.u32 d10, d11
    vpaddl.u32 d22, d23
    vpaddl.u32 d14, d15
    vpaddl.u32 d18, d19

    @;D10 : sad_top, D14 : sad_left, D18 : sad_right, D22 : sad_bot
    ldr r4, [sp, #84]                   @ psad: d8-d15 still pushed, 20 + 64 = 84
    @;Can be rearranged
    vsli.64 d10, d22, #32               @ d10 = {top, bot} as two u32
    vsli.64 d14, d18, #32               @ d14 = {left, right} as two u32
    vst1.64 {d14}, [r4]!                @ psad[0..1] = left, right
    vst1.64 {d10}, [r4]!                @ psad[2..3] = top, bot
    vpop {d8-d15}
    ldmfd sp!, {r4-r7, pc}

@*****************************************************************************
@*
@* Function Name   : ime_compute_satqd_16x16_lumainter_a9q
@* Description     : This function computes SAD for a 16x16 block.
@                  : It also computes, per 4x4 sub-block, whether any nonzero
@                    coefficient would survive transform and quantisation
@                    (zero-block prediction via the SATQD threshold test).
@
@ Arguments        : R0    : pointer to src buffer
@                    R1    : pointer to est buffer
@                    R2    : source stride
@                    R3    : est stride
@                    STACK : Threshold (pu2_thrsh, 9 halfwords: 8 SIMD + 1 DC),
@                            distortion (pi4_mb_distortion),
@                            is_nonzero (pu4_is_non_zero)
@
@ Values Returned  : NONE
@
@ Register Usage   : R0-R11
@ Stack Usage      :
@ Cycles           : Around
@ Interruptibility : Interruptible
@
@ Known Limitations
@   \Assumptions   :
@
@ Revision History :
@   DD MM YYYY    Author(s)           Changes
@   14 04 2014    Harinarayanan K K   First version
@
@*****************************************************************************
    .global ime_compute_satqd_16x16_lumainter_a9q

ime_compute_satqd_16x16_lumainter_a9q:
    @R0 : pointer to src buffer
    @R1 : pointer to est buffer
    @R2 : source stride
    @R3 : pred stride
    @R4 : threshold pointer
    @R5 : distortion, i.e. SAD
    @R6 : is_nonzero
    push {r4-r12, lr}                   @ 40 bytes pushed
    @ADD SP,SP,#40 ;decrement stack pointer, to accommodate two variables
    ldr r4, [sp, #40]                   @ threshold pointer (1st stack arg)
    vpush {d8-d15}
    mov r8, #8                          @ number of 4x8 blocks to process
    mov r10, #0                         @ running SAD accumulator
    mov r7, #0                          @ nonzero flag accumulator

    @ prime the pipeline: rows 1-4 of the first 4x8 block
    vld1.u8 d30, [r0], r2               @ I  load 8 px src row 1
    vld1.u8 d31, [r1], r3               @ I  load 8 px pred row 1
    vld1.u8 d28, [r0], r2               @ I  load 8 px src row 2
    vld1.u8 d29, [r1], r3               @ I  load 8 px pred row 2
    vld1.u8 d26, [r0], r2               @ I  load 8 px src row 3
    vabdl.u8 q0, d30, d31               @ I  abs diff row 1
    vld1.u8 d27, [r1], r3               @ I  load 8 px pred row 3
    vld1.u8 d24, [r0], r2               @ I  load 8 px src row 4
    vld1.u8 d25, [r1], r3               @ I  load 8 px pred row 4
    vabdl.u8 q1, d28, d29               @ I  abs diff row 2
    vld1.u16 {q11}, [r4]                @ I  load the 8 SIMD thresholds
    vabdl.u8 q2, d26, d27               @ I  abs diff row 3
    vabdl.u8 q3, d24, d25               @ I  abs diff row 4

core_loop:
    @ per-row column sums (two 4x4 blocks side by side):
    @S1  S2  S3  S4      A1  A2  A3  A4
    @S5  S6  S7  S8      A5  A6  A7  A8
    @S9  S10 S11 S12     A9  A10 A11 A12
    @S13 S14 S15 S16     A13 A14 A15 A16
    ands r11, r8, #1                    @ II even or odd 4x8 block? (sets flags;
                                        @    the plain LSLs below do NOT, so the
                                        @    EQ/NE fields still see this result)
    vadd.u16 q4, q0, q3                 @ I  add row 1 + row 4
    lsl r11, r2, #2                     @ II r11 = 4 src rows
    subeq r0, r0, r11                   @ II even block: back up src 4 rows
    vadd.u16 q5, q1, q2                 @ I  add row 2 + row 3
    addeq r0, r0, #8                    @ II even block: advance src 8 cols
    lsl r11, r3, #2                     @ II r11 = 4 pred rows
    vtrn.16 d8, d10                     @ I  transpose step 1
    subeq r1, r1, r11                   @ II even block: back up pred 4 rows
    addeq r1, r1, #8                    @ II even block: advance pred 8 cols
    vtrn.16 d9, d11                     @ I  transpose step 2
    subne r0, r0, #8                    @ II odd block: src 8 cols back
    subne r1, r1, #8                    @ II odd block: pred 8 cols back
    vtrn.32 d10, d11                    @ I  transpose step 4
    vtrn.32 d8, d9                      @ I  transpose step 3
    vswp d10, d11                       @ I  rearrange so q4 and q5 add properly
    @D8  S1 S4 A1 A4
    @D9  S2 S3 A2 A3
    @D11 S1 S4 A1 A4
    @D10 S2 S3 A2 A3
    vadd.s16 q6, q4, q5                 @ I  get s1..s4 column sums
    vld1.u8 d30, [r0], r2               @ II load 8 px src row 1
    vtrn.s16 d12, d13                   @ I  pair up s2/s3
    @D12 S1 S4 A1 A4
    @D13 S2 S3 A2 A3
    vshl.s16 q7, q6, #1                 @ I  si = si << 1
    vld1.u8 d31, [r1], r3               @ II load 8 px pred row 1
    vpadd.s16 d16, d12, d13             @ I  (s1+s4) (s2+s3)
    vld1.u8 d28, [r0], r2               @ II load 8 px src row 2
    @D16 S14 A14 S23 A23
    vrev32.16 d0, d16                   @ I
    vuzp.s16 d16, d0                    @ I
    @D16 S14 S23 A14 A23
    vadd.s16 d17, d12, d13              @ I  (s1+s2) (s3+s4)
    vld1.u8 d29, [r1], r3               @ II load 8 px pred row 2
    @D17 S12 S34 A12 A34
    vrev32.16 q9, q7                    @ I  rearrange shifted sums
    @Q9  Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
    vsub.s16 d8, d12, d19               @ I  (s1 - (s3<<1)) (s4 - (s2<<1))
    vld1.u8 d26, [r0], r2               @ II load 8 px src row 3
    vsub.s16 d9, d13, d18               @ I  (s2 - (s4<<1)) (s3 - (s1<<1))
    vld1.u8 d27, [r1], r3               @ II load 8 px pred row 3
    vpadd.s16 d10, d16, d17             @ I  SAD per block: s1+s2+s3+s4
    vld1.u8 d24, [r0], r2               @ II load 8 px src row 4
    @D10 SAD1 SAD2 junk junk
    vtrn.32 q8, q4                      @ I  group the "ls" terms per block
    @Q8  S2 S1 S8 S5 S6 S3 S7 S4
    @Q4  A2 A1 A8 A5 A6 A3 A7 A4
    ldrh r11, [r4, #16]                 @ I  9th halfword = DC threshold
    vdup.s16 q6, d10[0]                 @ I  broadcast SAD of block 1
    vabdl.u8 q0, d30, d31               @ II abs diff row 1
    vshl.s16 q7, q6, #1                 @ I  sad_2 = sad_1 << 1
    vmov.s16 r9, d10[0]                 @ I  SAD of block 1 to ARM
    vsub.s16 q9, q7, q8                 @ I  2*sad - ls terms, block 1
    vmov.s16 r5, d10[1]                 @ I  SAD of block 2 to ARM
    vcle.s16 q7, q11, q9                @ I  threshold <= term? (per lane, blk 1)
    vld1.u8 d25, [r1], r3               @ II load 8 px pred row 4
    vdup.s16 q15, d10[1]                @ I  broadcast SAD of block 2
    vabdl.u8 q1, d28, d29               @ II abs diff row 2
    vshl.s16 q14, q15, #1               @ I  sad_2 = sad_1 << 1
    vsub.s16 q3, q14, q4                @ I  2*sad - ls terms, block 2
    vcle.s16 q15, q11, q3               @ I  threshold <= term? (per lane, blk 2)
    ADD R10, R10, R9                    @ I  global SAD += block 1
    vtrn.u8 q15, q7                     @ I  gather comparison bits into one reg
    vabdl.u8 q2, d26, d27               @ II abs diff row 3
    ADD R10, R10, R5                    @ I  global SAD += block 2
    vshr.u8 q14, q15, #7                @ I  keep only the top (sign) bit per lane
    cmp r11, r9                         @ I  DC check, block 1
    movle r7, #0xf                      @ I  DC threshold <= SAD -> mark nonzero
    vadd.u8 d28, d28, d29               @ I  add the bits
    cmp r11, r5                         @ I  DC check, block 2
    movle r7, #0xf                      @ I  DC threshold <= SAD -> mark nonzero
    vpadd.u8 d28, d28, d29              @ I  add the bits
    vmov.u32 r11, d28[0]                @ I  any set bit = unsatisfied condition
    vabdl.u8 q3, d24, d25               @ II abs diff row 4
    orr r7, r7, r11                     @ I  merge into the nonzero flag
    sub r8, r8, #1                      @ I  one 4x8 block retired
    cmp r7, #0                          @ I  any nonzero block so far?
    bne compute_sad_only                @ I  yes -> only SAD needed from here on
    cmp r8, #1                          @ I  last block?
    bne core_loop                       @ I  no -> keep doing full SATQD

    @ EPILOGUE for core_loop: same reduction, no further loads
    @S1  S2  S3  S4      A1  A2  A3  A4
    @S5  S6  S7  S8      A5  A6  A7  A8
    @S9  S10 S11 S12     A9  A10 A11 A12
    @S13 S14 S15 S16     A13 A14 A15 A16
    vadd.u16 q4, q0, q3                 @ add row 1 + row 4
    vadd.u16 q5, q1, q2                 @ add row 2 + row 3
    vtrn.16 d8, d10                     @ transpose step 1
    vtrn.16 d9, d11                     @ transpose step 2
    vtrn.32 d8, d9                      @ transpose step 3
    vtrn.32 d10, d11                    @ transpose step 4
    vswp d10, d11                       @ rearrange so q4 and q5 add properly
    @D8  S1 S4 A1 A4
    @D9  S2 S3 A2 A3
    @D11 S1 S4 A1 A4
    @D10 S2 S3 A2 A3
    vadd.s16 q6, q4, q5                 @ get s1..s4
    vtrn.s16 d12, d13                   @ pair up s2/s3
    @D12 S1 S4 A1 A4
    @D13 S2 S3 A2 A3
    vshl.s16 q7, q6, #1                 @ si = si << 1
    vmov.s16 r9, d10[0]                 @ stale d10 (overwritten below; kept from
                                        @ the pipelined loop body)
    vpadd.s16 d16, d12, d13             @ (s1+s4) (s2+s3)
    vmov.s16 r5, d10[1]                 @ stale d10 (overwritten below)
    @D16 S14 A14 S23 A23
    vrev32.16 d30, d16
    vuzp.s16 d16, d30
    @D16 S14 S23 A14 A23
    vadd.s16 d17, d12, d13              @ (s1+s2) (s3+s4)
    @D17 S12 S34 A12 A34
    vrev32.16 q9, q7                    @ rearrange shifted sums
    @Q9  Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
    vsub.s16 d8, d12, d19               @ (s1 - (s3<<1)) (s4 - (s2<<1))
    vsub.s16 d9, d13, d18               @ (s2 - (s4<<1)) (s3 - (s1<<1))
    vpadd.s16 d10, d16, d17             @ SAD per block
    @D10 SAD1 SAD2 junk junk
    vmov.u16 r9, d10[0]                 @ SAD of block 1
    vmov.u16 r5, d10[1]                 @ SAD of block 2
    ldrh r11, [r4, #16]                 @ DC threshold
    vtrn.32 q8, q4                      @ group the "ls" terms per block
    ADD R10, R10, R9                    @ global SAD += block 1
    @Q8 S2 S1 S8 S5 S6 S3 S7 S4
    @Q4 A2 A1 A8 A5 A6 A3 A7 A4
    vld1.u16 {q11}, [r4]                @ reload the SIMD thresholds (q11 was
                                        @ clobbered as an accumulator)
    ADD R10, R10, R5                    @ global SAD += block 2
    vdup.u16 q6, d10[0]                 @ broadcast SAD of block 1
    cmp r11, r9                         @ DC check, block 1
    vshl.u16 q7, q6, #1                 @ sad_2 = sad_1 << 1
    vsub.s16 q9, q7, q8                 @ 2*sad - ls terms, block 1
    vcle.s16 q15, q11, q9               @ threshold <= term?
    movle r7, #0xf                      @ DC threshold <= SAD -> mark nonzero
    cmp r11, r5                         @ DC check, block 2
    vdup.u16 q14, d10[1]                @ broadcast SAD of block 2
    vshl.u16 q13, q14, #1               @ sad_2 = sad_1 << 1
    vsub.s16 q12, q13, q4               @ 2*sad - ls terms, block 2
    vcle.s16 q14, q11, q12              @ threshold <= term?
    movle r7, #0xf                      @ DC threshold <= SAD -> mark nonzero
    vtrn.u8 q14, q15                    @ gather comparison bits into one reg
    vshr.u8 q14, q14, #7                @ keep only the top bit so sums can't overflow
    vadd.u8 d28, d28, d29               @ add the bits
    vpadd.u8 d28, d28, d29              @ add the bits
    vmov.u32 r11, d28[0]                @ any set bit = unsatisfied condition
    orr r7, r7, r11                     @ merge into the nonzero flag
    b funcend_sad_16x16                 @ all blocks processed -> store and return

compute_sad_only:
    @ A nonzero block was found, so transform/quant zero-prediction is moot:
    @ from here on only the plain SAD is accumulated. Starts at an odd or
    @ even block; an odd block is finished first, then two at a time.
    and r11, r8, #1                     @ last bit of the block counter
    cmp r11, #0                         @ even or odd block?
    @ even block: just rewind the pointers to the start of the current rows
    lsleq r11, r2, #2
    subeq r0, r0, r11                   @ back up src 4 rows
    lsleq r11, r3, #2
    subeq r1, r1, r11                   @ back up pred 4 rows
    @ADDEQ R8,R8,#2 ;Inc counter
    beq skip_odd_blk

    @ odd block: its abs-diffs are already in q0-q3; reduce them to a SAD
    vadd.u16 q4, q0, q1                 @ row1 + row2
    vadd.u16 q5, q2, q3                 @ row3 + row4
    vadd.u16 q6, q4, q5                 @ rows 1-4
    vadd.u16 d14, d12, d13              @ blk1 + blk2 halves
    vpadd.u16 d16, d14, d15             @ add cols 1-2 and 3-4
    vpadd.u16 d18, d16, d17             @ add cols 12-34
    vmov.u16 r9, d18[0]                 @ SAD to ARM
    ADD R10, R10, R9                    @ add to the global SAD
    sub r8, r8, #1                      @ one block retired
    cmp r8, #0                          @ was that the last block?
    beq funcend_sad_16x16
    sub r0, r0, #8                      @ odd block done: back src up 8 cols
    sub r1, r1, #8                      @ and pred likewise

skip_odd_blk:
    @ 16-px-wide SAD over the remaining rows, two 4x8 blocks per pass
    vmov.s16 q0, #0                     @ clear accumulator (px 0-7)
    vmov.s16 q1, #0                     @ clear accumulator (px 8-15)
    vld1.u8 {q15}, [r0], r2             @ load src r1
    vld1.u8 {q14}, [r1], r3             @ load pred r1
    vld1.u8 {q13}, [r0], r2             @ load src r2
    vld1.u8 {q12}, [r1], r3             @ load pred r2
    vld1.u8 {q11}, [r0], r2             @ load src r3
    vld1.u8 {q10}, [r1], r3             @ load pred r3
    vld1.u8 {q9}, [r0], r2              @ load src r4
    vld1.u8 {q8}, [r1], r3              @ load pred r4
    cmp r8, #2
    beq sad_epilouge

sad_loop:
    vabal.u8 q0, d30, d28               @ I  accumulate abs diff r1
    vabal.u8 q1, d31, d29               @ I  accumulate abs diff r1
    vld1.u8 {q15}, [r0], r2             @ II load r1 src
    vabal.u8 q0, d26, d24               @ I  accumulate abs diff r2
    vld1.u8 {q14}, [r1], r3             @ II load r1 pred
    vabal.u8 q1, d27, d25               @ I  accumulate abs diff r2
    vld1.u8 {q13}, [r0], r2             @ II load r2 src
    vabal.u8 q0, d22, d20               @ I  accumulate abs diff r3
    vld1.u8 {q12}, [r1], r3             @ II load r2 pred
    vabal.u8 q1, d23, d21               @ I  accumulate abs diff r3
    vld1.u8 {q11}, [r0], r2             @ II load r3 src
    vabal.u8 q0, d18, d16               @ I  accumulate abs diff r4
    sub r8, r8, #2                      @ 16 px processed -> two 4x8 blocks retired
    vld1.u8 {q10}, [r1], r3             @ II load r3 pred
    vabal.u8 q1, d19, d17               @ I  accumulate abs diff r4
    cmp r8, #2                          @ last pass?
    vld1.u8 {q9}, [r0], r2              @ II load r4 src
    vld1.u8 {q8}, [r1], r3              @ II load r4 pred
    bne sad_loop

sad_epilouge:
    vabal.u8 q0, d30, d28               @ accumulate abs diff r1
    vabal.u8 q1, d31, d29
    vabal.u8 q0, d26, d24               @ accumulate abs diff r2
    vabal.u8 q1, d27, d25
    vabal.u8 q0, d22, d20               @ accumulate abs diff r3
    vabal.u8 q1, d23, d21
    vabal.u8 q0, d18, d16               @ accumulate abs diff r4
    vabal.u8 q1, d19, d17
    vadd.u16 q2, q0, q1                 @ merge the two accumulators
    vadd.u16 d6, d4, d5                 @ add the two block halves
    vpadd.u16 d8, d6, d7                @ add cols 1-2 and 3-4
    vpadd.u16 d10, d8, d9               @ add cols 12-34
    vmov.u16 r9, d10[0]                 @ SAD to ARM
    ADD R10, R10, R9                    @ add to the global SAD

funcend_sad_16x16:
    @ store results and return
    vpop {d8-d15}
    ldr r5, [sp, #44]                   @ pi4_mb_distortion (2nd stack arg)
    ldr r6, [sp, #48]                   @ pu4_is_non_zero (3rd stack arg)
    str r7, [r6]                        @ store the is-nonzero flag
    str r10, [r5]                       @ store the SAD
    @SUB SP,SP,#40
    pop {r4-r12, pc}