/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  ihevc_quant_iquant_ssd_neon_intr.c
 *
 * @brief
 *  Contains function definitions for quantization, followed by inverse
 *  quantization to find transform domain SSD
 *
 * @author
 *  100736
 *
 * @par List of Functions:
 *  - ihevc_quant_iquant_ssd_flat_scale_mat_neon()
 *  - ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact_neon()
 *
 * @remarks
 *
 *******************************************************************************
 */

/* System include files */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_defs.h"
#include "ihevc_debug.h"
#include "ihevc_trans_tables.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_func_selector.h"
#include "ihevc_trans_macros.h"
#include "arm_neon.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

WORD32 ihevc_quant_iquant_ssd_flat_scale_mat_neon(
    WORD16 *pi2_coeffs,
    WORD16 *pi2_quant_coeff,
    WORD16 *pi2_q_dst,
    WORD16 *pi2_iq_dst,
    WORD32 trans_size,
    WORD32 qp_div,
    WORD32 qp_rem,
    WORD32 q_add,
    WORD32 *pi4_quant_round_factor_0_1,
    WORD32 *pi4_quant_round_factor_1_2,
    WORD32 src_strd,
    WORD32 dst_q_strd,
    WORD32 dst_iq_strd,
    UWORD8 *csbf,
    WORD32 csbf_strd,
    WORD32 *zero_col,
    WORD32 *zero_row,
    WORD16 *pi2_dequant_coeff,
    LWORD64 *pi8_cost)
{
    WORD32 i, j;
    WORD32 log2_size;
    WORD32 cbf = 0;

    WORD16 qm = 4;
    WORD16 bd = 8;
    WORD32 q_bits, tr, temp;
    WORD32 block_col = 0;
    WORD32 block_row = 0;
    WORD32 temp_zero_col = 0;
    WORD32 temp_zero_row = 0;

    WORD32 sh;
    WORD32 s_iq;
    WORD32 sh_tmp;

    // ssd
    int32x4_t ssd0 = vdupq_n_s32(0);
    int32x2_t ssd1;
    WORD32 ssd;
    // const
    const int16x4_t zero = vdup_n_s16(0);
    const int16x4_t zero_d = vdup_n_s16(0);
    const int16x4_t sq = vdup_n_s16(g_ihevc_quant_scales[qp_rem]);
    const int16x4_t siq = vdup_n_s16((g_ihevc_iquant_scales_flat_scale[qp_rem]));
    // src
    int16x4_t s0, s1, s2, s3;
    // q-iq
    int16x4_t q0, q1, q2, q3;
    int16x4_t iq0, iq1, iq2, iq3;
    // residue
    int32x4_t r0, r1, r2, r3;
    // sign
    uint16x4_t psgn0, psgn1, psgn2, psgn3;
    uint16x4_t nsgn0, nsgn1, nsgn2, nsgn3;
    // abs(src)
    int16x4_t abs_s0, abs_s1, abs_s2, abs_s3;
    // q-temp
    int32x4_t qtmp_0, qtmp_1, qtmp_2, qtmp_3;
    int16x4_t pq0, pq1, pq2, pq3;
    int16x4_t nq0, nq1, nq2, nq3;
    // iq-temp
    int32x4_t iqtmp_0, iqtmp_1, iqtmp_2, iqtmp_3;

    int32x4_t add_q;
    int32x4_t add_iq = vdupq_n_s32(1);
    int32x4_t sh_iq_1;
    int32x4_t sh_iq;
    int32x4_t q_v_bits;

    (void)pi4_quant_round_factor_0_1;
    (void)pi4_quant_round_factor_1_2;
    (void)pi2_dequant_coeff;
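    /*
     * Overview: an informal scalar sketch of what the vector code below
     * computes per coefficient x, assuming the flat scaling matrix path
     * (not the exact generic-path implementation):
     *
     *   level  = sign(x) * sat16((|x| * g_ihevc_quant_scales[qp_rem]
     *                             + (q_add << (q_bits - QUANT_ROUND_FACTOR_Q))) >> q_bits)
     *   recon  = sat16((level * g_ihevc_iquant_scales_flat_scale[qp_rem]
     *                   + (1 << (sh - qp_div - 1))) >> (sh - qp_div))
     *   ssd   += (x - recon) * (x - recon)
     *
     * q_bits and sh are derived below from the QP and the transform size.
     * The function returns the coded-block flag (any non-zero level) and
     * writes the accumulated transform-domain SSD to *pi8_cost.
     */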
    GETRANGE(log2_size, trans_size);
    log2_size -= 1;

    tr = MAX_TR_DYNAMIC_RANGE - bd - log2_size;
    q_bits = QUANT_SHIFT + qp_div + tr + SCALING_Q_SHIFT - qm - FLAT_RESCALE_MAT_Q_SHIFT;
    temp = (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q));

    q_v_bits = vdupq_n_s32(-q_bits);
    add_q = vdupq_n_s32(temp);

    sh = bd + log2_size - 5;

    sh_tmp = (sh - qp_div - 1);
    sh_iq_1 = vdupq_n_s32(sh_tmp);
    add_iq = vshlq_s32(add_iq, sh_iq_1);

    s_iq = (-(sh - qp_div));
    sh_iq = vdupq_n_s32(s_iq);

    for(i = 0; i < trans_size; i += 4)
    {
        for(j = 0; j < trans_size; j += 4)
        {
            s0 = vld1_s16(pi2_coeffs + j);
            s1 = vld1_s16(pi2_coeffs + j + (src_strd));
            s2 = vld1_s16(pi2_coeffs + j + (2 * src_strd));
            s3 = vld1_s16(pi2_coeffs + j + (3 * src_strd));

            /* quantization */
            /* sign */
            psgn0 = vcge_s16(s0, zero);
            psgn1 = vcge_s16(s1, zero);
            psgn2 = vcge_s16(s2, zero);
            psgn3 = vcge_s16(s3, zero);

            nsgn0 = vclt_s16(s0, zero);
            nsgn1 = vclt_s16(s1, zero);
            nsgn2 = vclt_s16(s2, zero);
            nsgn3 = vclt_s16(s3, zero);

            /* |src| */
            abs_s0 = vabs_s16(s0);
            abs_s1 = vabs_s16(s1);
            abs_s2 = vabs_s16(s2);
            abs_s3 = vabs_s16(s3);

            /* tmp = tmp * quant_coeff */
            qtmp_0 = vmull_s16(abs_s0, sq);
            qtmp_1 = vmull_s16(abs_s1, sq);
            qtmp_2 = vmull_s16(abs_s2, sq);
            qtmp_3 = vmull_s16(abs_s3, sq);

            /* tmp += (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q)) */
            qtmp_0 = vaddq_s32(qtmp_0, add_q);
            qtmp_1 = vaddq_s32(qtmp_1, add_q);
            qtmp_2 = vaddq_s32(qtmp_2, add_q);
            qtmp_3 = vaddq_s32(qtmp_3, add_q);

            /* tmp >>= q_bits; */
            qtmp_0 = vshlq_s32(qtmp_0, q_v_bits);
            qtmp_1 = vshlq_s32(qtmp_1, q_v_bits);
            qtmp_2 = vshlq_s32(qtmp_2, q_v_bits);
            qtmp_3 = vshlq_s32(qtmp_3, q_v_bits);

            /* clip */
            q0 = vqmovn_s32(qtmp_0);
            q1 = vqmovn_s32(qtmp_1);
            q2 = vqmovn_s32(qtmp_2);
            q3 = vqmovn_s32(qtmp_3);

            /* restore sign */
            pq0 = vand_s16(q0, vreinterpret_s16_u16(psgn0));
            pq1 = vand_s16(q1, vreinterpret_s16_u16(psgn1));
            pq2 = vand_s16(q2, vreinterpret_s16_u16(psgn2));
            pq3 = vand_s16(q3, vreinterpret_s16_u16(psgn3));

            nq0 = vand_s16(q0, vreinterpret_s16_u16(nsgn0));
            nq1 = vand_s16(q1, vreinterpret_s16_u16(nsgn1));
            nq2 = vand_s16(q2, vreinterpret_s16_u16(nsgn2));
            nq3 = vand_s16(q3, vreinterpret_s16_u16(nsgn3));

            q0 = vsub_s16(pq0, nq0);
            q1 = vsub_s16(pq1, nq1);
            q2 = vsub_s16(pq2, nq2);
            q3 = vsub_s16(pq3, nq3);

            /* store */
            vst1_s16((pi2_q_dst + j), q0);
            vst1_s16((pi2_q_dst + j + dst_q_strd), q1);
            vst1_s16((pi2_q_dst + j + (2 * dst_q_strd)), q2);
            vst1_s16((pi2_q_dst + j + (3 * dst_q_strd)), q3);

            *(csbf + block_col) = 0;
            if(vget_lane_s64(vreinterpret_s64_s16(q0), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(q1), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(q2), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(q3), 0))
            {
                *(csbf + block_col) = 1;
            }

            if(*(csbf + block_col) == 1)
            {
                temp_zero_col |= (0xF << block_col * 4);
                temp_zero_row |= (0xF << block_row);

                /* inverse quantization */
                iqtmp_0 = vmull_s16(q0, siq);
                iqtmp_1 = vmull_s16(q1, siq);
                iqtmp_2 = vmull_s16(q2, siq);
                iqtmp_3 = vmull_s16(q3, siq);

                iqtmp_0 = vaddq_s32(iqtmp_0, add_iq);
                iqtmp_1 = vaddq_s32(iqtmp_1, add_iq);
                iqtmp_2 = vaddq_s32(iqtmp_2, add_iq);
                iqtmp_3 = vaddq_s32(iqtmp_3, add_iq);

                iqtmp_0 = vshlq_s32(iqtmp_0, sh_iq);
                iqtmp_1 = vshlq_s32(iqtmp_1, sh_iq);
                iqtmp_2 = vshlq_s32(iqtmp_2, sh_iq);
                iqtmp_3 = vshlq_s32(iqtmp_3, sh_iq);

                /* clip */
                iq0 = vqmovn_s32(iqtmp_0);
                iq1 = vqmovn_s32(iqtmp_1);
                iq2 = vqmovn_s32(iqtmp_2);
                iq3 = vqmovn_s32(iqtmp_3);

                /* store */
                vst1_s16((pi2_iq_dst + j), iq0);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), iq1);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), iq2);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), iq3);
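                /*
                 * Distortion for this 4x4 coefficient sub-block: widen to 32 bit,
                 * square the reconstruction error (coeff - inverse-quantized level)
                 * and accumulate it into ssd0. When the whole sub-block quantizes
                 * to zero (else-branch below), the reconstruction is zero, so the
                 * squared input coefficients are accumulated instead.
                 */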
                /* ssd */
                /* trans_coeff - inv.quant */
                r0 = vsubl_s16(s0, iq0);
                r1 = vsubl_s16(s1, iq1);
                r2 = vsubl_s16(s2, iq2);
                r3 = vsubl_s16(s3, iq3);

                /* SD */
                r0 = vmulq_s32(r0, r0);
                r1 = vmulq_s32(r1, r1);
                r2 = vmulq_s32(r2, r2);
                r3 = vmulq_s32(r3, r3);
            }
            else
            {
                /* store */
                vst1_s16((pi2_iq_dst + j), zero_d);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), zero_d);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), zero_d);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), zero_d);

                /* SD */
                r0 = vmull_s16(s0, s0);
                r1 = vmull_s16(s1, s1);
                r2 = vmull_s16(s2, s2);
                r3 = vmull_s16(s3, s3);
            }

            /* SSD */
            r0 = vaddq_s32(r0, r1);
            r2 = vaddq_s32(r2, r3);
            r0 = vaddq_s32(r0, r2);

            /* SSD Accumulation */
            ssd0 = vaddq_s32(ssd0, r0);

            cbf = cbf || (*(csbf + block_col));  // cbf update
            block_col++;
        }

        block_col = 0;
        block_row += 4;
        csbf += csbf_strd;

        pi2_coeffs += 4 * src_strd;
        pi2_q_dst += 4 * dst_q_strd;
        pi2_iq_dst += 4 * dst_iq_strd;
        pi2_quant_coeff += 4 * trans_size;
    }

    /* SSD Computation */
    ssd1 = vpadd_s32(vget_low_s32(ssd0), vget_high_s32(ssd0));
    ssd1 = vpadd_s32(ssd1, ssd1);
    ssd = vget_lane_s32(ssd1, 0);

    *zero_col = ~temp_zero_col;  //final zero_col storing
    *zero_row = ~temp_zero_row;  //final zero_row storing

    /* Store the cost */
    *pi8_cost = ssd;

    return cbf;
}
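/*
 * Variant of the function above with per-coefficient rounding factors. The
 * quantized level is first computed with a rounding offset of 0 and with the
 * default offset of (1 << QUANT_ROUND_FACTOR_Q) / 2. Wherever the zero-rounded
 * level is below 2, the level is recomputed with the caller-supplied rounding
 * factors (*pi4_quant_round_factor_0_1 for coefficients whose zero-rounded
 * level is 0, *pi4_quant_round_factor_1_2 for those whose level is 1) and the
 * appropriate result is blended in lane-wise. Inverse quantization and the SSD
 * accumulation are identical to ihevc_quant_iquant_ssd_flat_scale_mat_neon().
 */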
WORD32 ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact_neon(
    WORD16 *pi2_coeffs,
    WORD16 *pi2_quant_coeff,
    WORD16 *pi2_q_dst,
    WORD16 *pi2_iq_dst,
    WORD32 trans_size,
    WORD32 qp_div, /* qpscaled / 6 */
    WORD32 qp_rem, /* qpscaled % 6 */
    WORD32 q_add,
    WORD32 *pi4_quant_round_factor_0_1,
    WORD32 *pi4_quant_round_factor_1_2,
    WORD32 src_strd,
    WORD32 dst_q_strd,
    WORD32 dst_iq_strd,
    UWORD8 *csbf,
    WORD32 csbf_strd,
    WORD32 *zero_col,
    WORD32 *zero_row,
    WORD16 *pi2_dequant_coeff,
    LWORD64 *pi8_cost)
{
    WORD32 i, j;
    WORD32 log2_size;
    WORD32 cbf = 0;

    WORD16 qm = 4;
    WORD16 bd = 8;
    WORD32 q_bits, tr;
    WORD32 block_col = 0;
    WORD32 block_row = 0;
    WORD32 temp_zero_col = 0;
    WORD32 temp_zero_row = 0;

    WORD32 sh;
    WORD32 s_iq;
    WORD32 sh_tmp;

    // ssd
    int32x4_t ssd0 = vdupq_n_s32(0);
    int32x2_t ssd1;
    WORD32 ssd;
    // const
    const int16x8_t zero = vdupq_n_s16(0);
    const int16x4_t zero_d = vdup_n_s16(0);
    const int16x8_t one = vdupq_n_s16(1);
    const int16x8_t two = vdupq_n_s16(2);
    const int16x4_t sq = vdup_n_s16(g_ihevc_quant_scales[qp_rem]);
    const int16x4_t siq = vdup_n_s16((g_ihevc_iquant_scales_flat_scale[qp_rem]));
    // src
    int16x4_t s0, s1, s2, s3;
    // sign
    uint16x8_t psgn0, psgn1;
    uint16x8_t nsgn0, nsgn1;
    int16x8_t pq0, pq1;
    int16x8_t nq0, nq1;
    // abs(src)
    int16x4_t abs_s0, abs_s1, abs_s2, abs_s3;
    // q-temp
    int32x4_t mul_0, mul_1, mul_2, mul_3;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int16x8_t q_00, q_01;
    int16x8_t q_10, q_11;
    int16x8_t q_20, q_21;
    int16x8_t q_30, q_31;
    // cmp
    uint16x8_t cmp_00, cmp_01;
    uint16x8_t cmp_10, cmp_11;
    uint16x8_t cmp_20, cmp_21;
    // iq-temp
    int32x4_t iqtmp_0, iqtmp_1, iqtmp_2, iqtmp_3;
    int16x4_t iq0, iq1, iq2, iq3;
    // residue
    int32x4_t r0, r1, r2, r3;
    // add_q
    int32x4_t add_q;
    int32x4_t add_q0, add_q1, add_q2, add_q3;

    int32x4_t add_iq = vdupq_n_s32(1);
    int32x4_t sh_iq_1;
    int32x4_t sh_iq;
    int32x4_t q_v_bits;
    int32x4_t stmp;

    (void)q_add;
    (void)pi2_dequant_coeff;

    GETRANGE(log2_size, trans_size);
    log2_size -= 1;

    tr = MAX_TR_DYNAMIC_RANGE - bd - log2_size;
    q_bits = QUANT_SHIFT + qp_div + tr + SCALING_Q_SHIFT - qm - FLAT_RESCALE_MAT_Q_SHIFT;

    stmp = vdupq_n_s32(q_bits - QUANT_ROUND_FACTOR_Q);

    add_q = vdupq_n_s32((1 << QUANT_ROUND_FACTOR_Q) / 2);
    add_q = vshlq_s32(add_q, stmp);

    q_v_bits = vdupq_n_s32(-q_bits);

    sh = bd + log2_size - 5;

    sh_tmp = (sh - qp_div - 1);
    sh_iq_1 = vdupq_n_s32(sh_tmp);
    add_iq = vshlq_s32(add_iq, sh_iq_1);

    s_iq = (-(sh - qp_div));
    sh_iq = vdupq_n_s32(s_iq);

    for(i = 0; i < trans_size; i += 4)
    {
        for(j = 0; j < trans_size; j += 4)
        {
            s0 = vld1_s16(pi2_coeffs + j);
            s1 = vld1_s16(pi2_coeffs + j + (src_strd));
            s2 = vld1_s16(pi2_coeffs + j + (2 * src_strd));
            s3 = vld1_s16(pi2_coeffs + j + (3 * src_strd));

            /* quantization */
            /* sign */
            psgn0 = vcgeq_s16(vcombine_s16(s0, s1), zero);
            psgn1 = vcgeq_s16(vcombine_s16(s2, s3), zero);

            nsgn0 = vcltq_s16(vcombine_s16(s0, s1), zero);
            nsgn1 = vcltq_s16(vcombine_s16(s2, s3), zero);

            /* |src| */
            abs_s0 = vabs_s16(s0);
            abs_s1 = vabs_s16(s1);
            abs_s2 = vabs_s16(s2);
            abs_s3 = vabs_s16(s3);

            /* tmp = tmp * quant_coeff */
            mul_0 = vmull_s16(abs_s0, sq);
            mul_1 = vmull_s16(abs_s1, sq);
            mul_2 = vmull_s16(abs_s2, sq);
            mul_3 = vmull_s16(abs_s3, sq);

            /* qadd = 0 */
            /* tmp >>= q_bits; */
            q_tmp0 = vshlq_s32(mul_0, q_v_bits);
            q_tmp1 = vshlq_s32(mul_1, q_v_bits);
            q_tmp2 = vshlq_s32(mul_2, q_v_bits);
            q_tmp3 = vshlq_s32(mul_3, q_v_bits);

            /* clip */
            q_00 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
            q_01 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

            /* compare q_00, q_01 with 2 */
            cmp_00 = vcltq_s16(q_00, two);
            cmp_01 = vcltq_s16(q_01, two);

            /* qadd = (1 << QUANT_ROUND_FACTOR_Q) / 2 */
            /* tmp >>= q_bits; */
            q_tmp0 = vaddq_s32(mul_0, add_q);
            q_tmp1 = vaddq_s32(mul_1, add_q);
            q_tmp2 = vaddq_s32(mul_2, add_q);
            q_tmp3 = vaddq_s32(mul_3, add_q);

            q_tmp0 = vshlq_s32(q_tmp0, q_v_bits);
            q_tmp1 = vshlq_s32(q_tmp1, q_v_bits);
            q_tmp2 = vshlq_s32(q_tmp2, q_v_bits);
            q_tmp3 = vshlq_s32(q_tmp3, q_v_bits);

            /* clip */
            q_10 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
            q_11 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));
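            /*
             * If every provisional level (computed above with zero rounding)
             * in this sub-block is >= 2, the default-rounding result in
             * q_10/q_11 is kept and the per-coefficient rounding factors are
             * not loaded at all; otherwise the 0-1 and 1-2 rounding factors
             * are applied below and blended in lane-wise.
             */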
            if(vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp_00)), 0) ||
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp_00)), 0) ||
               vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp_01)), 0) ||
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp_01)), 0))
            {
                /* qadd = *pi4_quant_round_factor_1_2 */
                /* tmp >>= q_bits; */
                add_q0 = vld1q_s32(pi4_quant_round_factor_1_2 + j);
                add_q1 = vld1q_s32(pi4_quant_round_factor_1_2 + j + (trans_size));
                add_q2 = vld1q_s32(pi4_quant_round_factor_1_2 + j + (2 * trans_size));
                add_q3 = vld1q_s32(pi4_quant_round_factor_1_2 + j + (3 * trans_size));

                add_q0 = vshlq_s32(add_q0, stmp);
                add_q1 = vshlq_s32(add_q1, stmp);
                add_q2 = vshlq_s32(add_q2, stmp);
                add_q3 = vshlq_s32(add_q3, stmp);

                q_tmp0 = vaddq_s32(mul_0, add_q0);
                q_tmp1 = vaddq_s32(mul_1, add_q1);
                q_tmp2 = vaddq_s32(mul_2, add_q2);
                q_tmp3 = vaddq_s32(mul_3, add_q3);

                q_tmp0 = vshlq_s32(q_tmp0, q_v_bits);
                q_tmp1 = vshlq_s32(q_tmp1, q_v_bits);
                q_tmp2 = vshlq_s32(q_tmp2, q_v_bits);
                q_tmp3 = vshlq_s32(q_tmp3, q_v_bits);

                /* clip */
                q_20 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
                q_21 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

                /* qadd = *pi4_quant_round_factor_0_1 */
                /* tmp >>= q_bits; */
                add_q0 = vld1q_s32(pi4_quant_round_factor_0_1 + j);
                add_q1 = vld1q_s32(pi4_quant_round_factor_0_1 + j + (trans_size));
                add_q2 = vld1q_s32(pi4_quant_round_factor_0_1 + j + (2 * trans_size));
                add_q3 = vld1q_s32(pi4_quant_round_factor_0_1 + j + (3 * trans_size));

                add_q0 = vshlq_s32(add_q0, stmp);
                add_q1 = vshlq_s32(add_q1, stmp);
                add_q2 = vshlq_s32(add_q2, stmp);
                add_q3 = vshlq_s32(add_q3, stmp);

                q_tmp0 = vaddq_s32(mul_0, add_q0);
                q_tmp1 = vaddq_s32(mul_1, add_q1);
                q_tmp2 = vaddq_s32(mul_2, add_q2);
                q_tmp3 = vaddq_s32(mul_3, add_q3);

                q_tmp0 = vshlq_s32(q_tmp0, q_v_bits);
                q_tmp1 = vshlq_s32(q_tmp1, q_v_bits);
                q_tmp2 = vshlq_s32(q_tmp2, q_v_bits);
                q_tmp3 = vshlq_s32(q_tmp3, q_v_bits);

                /* clip */
                q_30 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
                q_31 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

                /* compare q_00, q_01 with 1 */
                cmp_10 = vcltq_s16(q_00, one);
                cmp_11 = vcltq_s16(q_01, one);

                cmp_20 = vbicq_u16(cmp_00, cmp_10);
                cmp_21 = vbicq_u16(cmp_01, cmp_11);

                q_10 = vbslq_s16(cmp_10, q_30, q_10);
                q_11 = vbslq_s16(cmp_11, q_31, q_11);

                q_10 = vbslq_s16(cmp_20, q_20, q_10);
                q_11 = vbslq_s16(cmp_21, q_21, q_11);
            }

            /* restore sign */
            pq0 = vandq_s16(q_10, vreinterpretq_s16_u16(psgn0));
            pq1 = vandq_s16(q_11, vreinterpretq_s16_u16(psgn1));

            nq0 = vandq_s16(q_10, vreinterpretq_s16_u16(nsgn0));
            nq1 = vandq_s16(q_11, vreinterpretq_s16_u16(nsgn1));

            q_10 = vsubq_s16(pq0, nq0);
            q_11 = vsubq_s16(pq1, nq1);

            /* store */
            vst1_s16((pi2_q_dst + j), vget_low_s16(q_10));
            vst1_s16((pi2_q_dst + j + dst_q_strd), vget_high_s16(q_10));
            vst1_s16((pi2_q_dst + j + (2 * dst_q_strd)), vget_low_s16(q_11));
            vst1_s16((pi2_q_dst + j + (3 * dst_q_strd)), vget_high_s16(q_11));

            *(csbf + block_col) = 0;
            if(vget_lane_s64(vreinterpret_s64_s16(vget_low_s16(q_10)), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(vget_high_s16(q_10)), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(vget_low_s16(q_11)), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(vget_high_s16(q_11)), 0))
            {
                *(csbf + block_col) = 1;
            }

            if(*(csbf + block_col) == 1)
            {
                temp_zero_col |= (0xF << block_col * 4);
                temp_zero_row |= (0xF << block_row);

                /* inverse quantization */
                iqtmp_0 = vmull_s16(vget_low_s16(q_10), siq);
                iqtmp_1 = vmull_s16(vget_high_s16(q_10), siq);
                iqtmp_2 = vmull_s16(vget_low_s16(q_11), siq);
                iqtmp_3 = vmull_s16(vget_high_s16(q_11), siq);

                iqtmp_0 = vaddq_s32(iqtmp_0, add_iq);
                iqtmp_1 = vaddq_s32(iqtmp_1, add_iq);
                iqtmp_2 = vaddq_s32(iqtmp_2, add_iq);
                iqtmp_3 = vaddq_s32(iqtmp_3, add_iq);

                iqtmp_0 = vshlq_s32(iqtmp_0, sh_iq);
                iqtmp_1 = vshlq_s32(iqtmp_1, sh_iq);
                iqtmp_2 = vshlq_s32(iqtmp_2, sh_iq);
                iqtmp_3 = vshlq_s32(iqtmp_3, sh_iq);

                /* clip */
                iq0 = vqmovn_s32(iqtmp_0);
                iq1 = vqmovn_s32(iqtmp_1);
                iq2 = vqmovn_s32(iqtmp_2);
                iq3 = vqmovn_s32(iqtmp_3);

                /* store */
                vst1_s16((pi2_iq_dst + j), iq0);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), iq1);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), iq2);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), iq3);

                /* ssd */
                /* trans_coeff - inv.quant */
                r0 = vsubl_s16(s0, iq0);
                r1 = vsubl_s16(s1, iq1);
                r2 = vsubl_s16(s2, iq2);
                r3 = vsubl_s16(s3, iq3);

                /* SD */
                r0 = vmulq_s32(r0, r0);
                r1 = vmulq_s32(r1, r1);
                r2 = vmulq_s32(r2, r2);
                r3 = vmulq_s32(r3, r3);
            }
            else
            {
                /* store */
                vst1_s16((pi2_iq_dst + j), zero_d);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), zero_d);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), zero_d);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), zero_d);

                /* SD */
                r0 = vmull_s16(s0, s0);
                r1 = vmull_s16(s1, s1);
                r2 = vmull_s16(s2, s2);
                r3 = vmull_s16(s3, s3);
            }

            /* SSD */
            r0 = vaddq_s32(r0, r1);
            r2 = vaddq_s32(r2, r3);
            r0 = vaddq_s32(r0, r2);

            /* SSD Accumulation */
            ssd0 = vaddq_s32(ssd0, r0);

            cbf = cbf || (*(csbf + block_col));  // cbf update
            block_col++;
        }

        block_col = 0;
        block_row += 4;
        csbf += csbf_strd;

        pi2_coeffs += 4 * src_strd;
        pi2_q_dst += 4 * dst_q_strd;
        pi2_iq_dst += 4 * dst_iq_strd;
        pi2_quant_coeff += 4 * trans_size;
        pi4_quant_round_factor_1_2 += 4 * trans_size;
        pi4_quant_round_factor_0_1 += 4 * trans_size;
    }
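    /*
     * Reduce the four 32-bit partial sums in ssd0 to a single SSD value with
     * two pairwise adds, then store the inverted non-zero column/row masks and
     * the distortion cost.
     */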
    /* SSD Computation */
    ssd1 = vpadd_s32(vget_low_s32(ssd0), vget_high_s32(ssd0));
    ssd1 = vpadd_s32(ssd1, ssd1);
    ssd = vget_lane_s32(ssd1, 0);

    *zero_col = ~temp_zero_col;  //final zero_col storing
    *zero_row = ~temp_zero_row;  //final zero_row storing

    /* Store the cost */
    *pi8_cost = ssd;

    return cbf;
}