/******************************************************************************
*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* ihevc_resi_trans_neon.c
*
* @brief
* Contains definitions of functions for computing residue and fwd transform
*
* @author
* Ittiam
*
* @par List of Functions:
* - ihevc_resi_trans_4x4_neon()
* - ihevc_resi_trans_4x4_ttype1_neon()
* - ihevc_resi_trans_8x8_neon()
* - ihevc_resi_trans_16x16_neon()
* @remarks
* None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <string.h>
/* User include files */
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_defs.h"
#include "ihevc_cmn_utils_neon.h"
#include "ihevc_trans_tables.h"
#include "ihevc_resi_trans.h"
/*****************************************************************************/
/* Function Definitions */
/*****************************************************************************/
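/**
*******************************************************************************
*
* @brief
* This function performs residue calculation and forward transform on
* input pixels
*
* @par Description:
* Performs residue calculation by subtracting the prediction from the source,
* followed by a forward transform
*
* @param[in] pu1_src
* Input 4x4 pixels
*
* @param[in] pu1_pred
* Prediction data
*
* @param[in] pi4_temp
* Temporary buffer; unused by this function
*
* @param[out] pi2_dst
* Output 4x4 coefficients
*
* @param[in] src_strd
* Input stride
*
* @param[in] pred_strd
* Prediction Stride
*
* @param[in] dst_strd_chr_flag
* Output Stride and Chroma Flag packed in the MS and LS 16 bits respectively
* (0 - luma transform, 1 - chroma transform)
*
* @returns block sad
*
* @remarks
* None
*
*******************************************************************************
*/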
UWORD32 ihevc_resi_trans_4x4_neon(
UWORD8 *pu1_src,
UWORD8 *pu1_pred,
WORD32 *pi4_temp,
WORD16 *pi2_dst,
WORD32 src_strd,
WORD32 pred_strd,
WORD32 dst_strd_chr_flag)
{
WORD32 chroma_flag = dst_strd_chr_flag & 1;
WORD32 dst_strd = dst_strd_chr_flag >> 16;
UWORD32 sad;
uint8x16_t inp_buf, pred_buf;
int16x8_t diff_1, diff_2;
int16x4_t diff_1_low, diff_1_high, diff_2_low, diff_2_high;
int16x8_t e_01, o_32;
int16x4_t e_0, e_1, o_0, o_1;
int32x4_t e_0_a_e_1, e_0_s_e_1;
int32x4_t temp1, temp2, temp3, temp4;
int32x4_t o_1_m_trans_10, o_1_m_trans_11;
int32x4_t e_03, e_12, o_03, o_12;
int16x4_t out_0, out_1, out_2, out_3;
uint16x8_t abs;
uint32x4_t b;
uint64x2_t c;
(void)pi4_temp;
if(chroma_flag == 0)
{
inp_buf = load_unaligned_u8q(pu1_src, src_strd);
pred_buf = load_unaligned_u8q(pu1_pred, pred_strd);
}
else
{
inp_buf = load_unaligned_u8qi(pu1_src, src_strd);
pred_buf = load_unaligned_u8qi(pu1_pred, pred_strd);
}
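/* SAD: accumulate |src - pred| over all 16 pixels and reduce to a scalar */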
abs = vabdl_u8(vget_low_u8(inp_buf), vget_low_u8(pred_buf));
abs = vabal_u8(abs, vget_high_u8(inp_buf), vget_high_u8(pred_buf));
b = vpaddlq_u16(abs);
c = vpaddlq_u32(b);
sad = vget_lane_u32(vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
vreinterpret_u32_u64(vget_high_u64(c))),
0);
diff_1 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(inp_buf), vget_low_u8(pred_buf)));
diff_2 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(inp_buf), vget_high_u8(pred_buf)));
diff_1_low = vget_low_s16(diff_1);
diff_1_high = vget_high_s16(diff_1);
diff_2_low = vget_low_s16(diff_2);
diff_2_high = vget_high_s16(diff_2);
transpose_s16_4x4d(&diff_1_low, &diff_1_high, &diff_2_low, &diff_2_high);
diff_1 = vcombine_s16(diff_1_low, diff_1_high);
diff_2 = vcombine_s16(diff_2_high, diff_2_low);
e_01 = vaddq_s16(diff_1, diff_2);
o_32 = vsubq_s16(diff_1, diff_2);
e_0 = vget_low_s16(e_01);
e_1 = vget_high_s16(e_01);
o_0 = vget_high_s16(o_32);
o_1 = vget_low_s16(o_32);
e_0_a_e_1 = vaddl_s16(e_0, e_1);
e_0_s_e_1 = vsubl_s16(e_0, e_1);
temp1 = vmulq_n_s32(e_0_a_e_1, (WORD32)g_ai2_ihevc_trans_4[0][0]);
temp2 = vmulq_n_s32(e_0_s_e_1, (WORD32)g_ai2_ihevc_trans_4[0][0]);
o_1_m_trans_10 = vmull_n_s16(o_1, (WORD32)g_ai2_ihevc_trans_4[1][0]);
o_1_m_trans_11 = vmull_n_s16(o_1, (WORD32)g_ai2_ihevc_trans_4[1][1]);
temp3 = vmlal_n_s16(o_1_m_trans_10, o_0, (WORD32)g_ai2_ihevc_trans_4[1][1]);
temp4 = vmlsl_n_s16(o_1_m_trans_11, o_0, (WORD32)g_ai2_ihevc_trans_4[1][0]);
transpose_s32_4x4(&temp1, &temp3, &temp2, &temp4);
e_03 = vaddq_s32(temp1, temp4);
e_12 = vaddq_s32(temp3, temp2);
o_03 = vsubq_s32(temp1, temp4);
o_12 = vsubq_s32(temp3, temp2);
e_0_a_e_1 = vaddq_s32(e_03, e_12);
e_0_s_e_1 = vsubq_s32(e_03, e_12);
temp1 = vmulq_n_s32(e_0_a_e_1, (WORD32)g_ai2_ihevc_trans_4[0][0]);
temp2 = vmulq_n_s32(e_0_s_e_1, (WORD32)g_ai2_ihevc_trans_4[0][0]);
o_1_m_trans_10 = vmulq_n_s32(o_03, (WORD32)g_ai2_ihevc_trans_4[1][0]);
o_1_m_trans_11 = vmulq_n_s32(o_03, (WORD32)g_ai2_ihevc_trans_4[1][1]);
temp3 = vmlaq_n_s32(o_1_m_trans_10, o_12, (WORD32)g_ai2_ihevc_trans_4[1][1]);
temp4 = vmlsq_n_s32(o_1_m_trans_11, o_12, (WORD32)g_ai2_ihevc_trans_4[1][0]);
out_0 = vrshrn_n_s32(temp1, 9);
out_1 = vrshrn_n_s32(temp3, 9);
out_2 = vrshrn_n_s32(temp2, 9);
out_3 = vrshrn_n_s32(temp4, 9);
vst1_s16(pi2_dst, out_0);
vst1_s16(pi2_dst + dst_strd, out_1);
vst1_s16(pi2_dst + 2 * dst_strd, out_2);
vst1_s16(pi2_dst + 3 * dst_strd, out_3);
return sad;
}
/**
*******************************************************************************
*
* @brief
* This function performs residue calculation and forward transform type 1
* on input pixels
*
* @par Description:
* Performs residue calculation by subtracting the prediction from the source,
* followed by a type 1 forward transform
*
* @param[in] pu1_src
* Input 4x4 pixels
*
* @param[in] pu1_pred
* Prediction data
*
* @param[in] pi4_temp
* Temporary buffer; unused by this function
*
* @param[out] pi2_dst
* Output 4x4 coefficients
*
* @param[in] src_strd
* Input stride
*
* @param[in] pred_strd
* Prediction Stride
*
* @param[in] dst_strd_chr_flag
* Output Stride and Chroma Flag packed in the MS and LS 16 bits respectively
* (0 - luma transform, 1 - chroma transform); not used for the 4x4 ttype1 transform
*
* @returns block sad
*
* @remarks
* None
*
*******************************************************************************
*/
UWORD32 ihevc_resi_trans_4x4_ttype1_neon(
UWORD8 *pu1_src,
UWORD8 *pu1_pred,
WORD32 *pi4_temp,
WORD16 *pi2_dst,
WORD32 src_strd,
WORD32 pred_strd,
WORD32 dst_strd_chr_flag)
{
WORD32 dst_strd;
UWORD32 sad;
int16x4_t src0_4x16b;
int16x4_t src1_4x16b;
int16x4_t src2_4x16b;
int16x4_t src3_4x16b;
int32x4_t src0_4x32b;
int32x4_t src1_4x32b;
int32x4_t src2_4x32b;
int32x4_t src3_4x32b;
/*load source and pred values */
const uint8x16_t src_u8 = load_unaligned_u8q(pu1_src, src_strd);
const uint8x16_t pred_u8 = load_unaligned_u8q(pu1_pred, pred_strd);
const int16x8_t src_reg0 =
vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(src_u8), vget_low_u8(pred_u8)));
const int16x8_t src_reg1 =
vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(src_u8), vget_high_u8(pred_u8)));
int32x4_t add_val = vdupq_n_s32(1);
uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(pred_u8));
uint32x4_t b;
uint64x2_t c;
abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(pred_u8));
b = vpaddlq_u16(abs);
c = vpaddlq_u32(b);
sad = vget_lane_u32(vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
vreinterpret_u32_u64(vget_high_u64(c))),
0);
(void)pi4_temp;
dst_strd = dst_strd_chr_flag >> 16;
/************************* 4x4 16bit Transpose ***********************/
src0_4x16b = vget_low_s16(src_reg0);
src1_4x16b = vget_high_s16(src_reg0);
src2_4x16b = vget_low_s16(src_reg1);
src3_4x16b = vget_high_s16(src_reg1);
transpose_s16_4x4d(&src0_4x16b, &src1_4x16b, &src2_4x16b, &src3_4x16b);
/************************** 4x4 Transpose End *************************/
/* Residue + Forward Transform 1st stage */
/* coeff2_4x32b = 74 74 74 74 */
const int32x4_t coeff2_4x32b =
vdupq_n_s32(74); //vld1q_s32(&g_ai4_ihevc_trans_dst_intr_4[2][0]);
/* coeff0_4x32b = 29 29 29 29 */
const int32x4_t coeff0_4x32b =
vdupq_n_s32(29); //vld1q_s32(&g_ai4_ihevc_trans_dst_intr_4[0][0]);
/* coeff1_4x32b = 55 55 55 55 */
const int32x4_t coeff1_4x32b =
vdupq_n_s32(55); //vld1q_s32(&g_ai4_ihevc_trans_dst_intr_4[1][0]);
/* c0 to c3 calculation */
int32x4_t c0_4x32b = vaddl_s16(src0_4x16b, src3_4x16b); /* r0+r3 */
int32x4_t c1_4x32b = vaddl_s16(src1_4x16b, src3_4x16b); /* r1+r3 */
int32x4_t c2_4x32b = vsubl_s16(src0_4x16b, src1_4x16b); /* r0-r1 */
int32x4_t c3_4x32b = vmulq_s32(vmovl_s16(src2_4x16b), coeff2_4x32b); /* 74*r2 */
src0_4x16b = vadd_s16(src0_4x16b, src1_4x16b); /* r0+r1 */
src1_4x32b = vsubl_s16(src0_4x16b, src3_4x16b); /* r0+r1-r3 */
src0_4x32b = vmlaq_s32(c3_4x32b, c0_4x32b, coeff0_4x32b); /* 29*c0 + c3 */
src2_4x32b = vmulq_s32(c2_4x32b, coeff0_4x32b); /* 29*c2; c3 subtracted below */
src3_4x32b = vmlaq_s32(c3_4x32b, c2_4x32b, coeff1_4x32b); /* 55*c2 + c3 */
src2_4x32b = vsubq_s32(src2_4x32b, c3_4x32b);
src0_4x32b = vmlaq_s32(src0_4x32b, c1_4x32b, coeff1_4x32b); /* 29*c0 + 55*c1 + c3 */
src2_4x32b = vmlaq_s32(src2_4x32b, c0_4x32b, coeff1_4x32b); /* 29*c2 + 55*c0 - c3 */
c1_4x32b = vmulq_s32(c1_4x32b, coeff0_4x32b); /* 29*c1; subtracted below to give 55*c2 - 29*c1 + c3 */
src1_4x32b = vmulq_s32(src1_4x32b, coeff2_4x32b); /*74*(r0+r1-r3)*/
src3_4x32b = vsubq_s32(src3_4x32b, c1_4x32b);
/* result + add */
src1_4x32b = vaddq_s32(src1_4x32b, add_val);
src0_4x32b = vaddq_s32(src0_4x32b, add_val);
src2_4x32b = vaddq_s32(src2_4x32b, add_val);
src3_4x32b = vaddq_s32(src3_4x32b, add_val);
/* result >> shift */
src1_4x32b = vshrq_n_s32(src1_4x32b, 1);
src0_4x32b = vshrq_n_s32(src0_4x32b, 1);
src2_4x32b = vshrq_n_s32(src2_4x32b, 1);
src3_4x32b = vshrq_n_s32(src3_4x32b, 1);
/* Forward transform 2nd stage */
{
/************************* 4x4 32bit Transpose ***********************/
transpose_s32_4x4(&src0_4x32b, &src1_4x32b, &src2_4x32b, &src3_4x32b);
/************************** 4x4 Transpose End *************************/
/* add value */
add_val = vdupq_n_s32(128);
c0_4x32b = vaddq_s32(src0_4x32b, src3_4x32b); /* r0+r3 */
c1_4x32b = vaddq_s32(src1_4x32b, src3_4x32b); /* r1+r3 */
c2_4x32b = vsubq_s32(src0_4x32b, src1_4x32b); /* r0-r1 */
c3_4x32b = vmulq_s32(src2_4x32b, coeff2_4x32b); /* 74*r2 */
src1_4x32b = vaddq_s32(src0_4x32b, src1_4x32b); /* r0+r1 */
src1_4x32b = vsubq_s32(src1_4x32b, src3_4x32b); /* r0+r1-r3 */
src0_4x32b = vmlaq_s32(c3_4x32b, c0_4x32b, coeff0_4x32b); /* 29*c0 + c3 */
src2_4x32b = vmulq_s32(c2_4x32b, coeff0_4x32b); /* 29*c2; c3 subtracted below */
src3_4x32b = vmlaq_s32(c3_4x32b, c2_4x32b, coeff1_4x32b); /* 55*c2 + c3 */
src2_4x32b = vsubq_s32(src2_4x32b, c3_4x32b);
src0_4x32b = vmlaq_s32(src0_4x32b, c1_4x32b, coeff1_4x32b); /* 29*c0 + 55*c1 + c3 */
src2_4x32b = vmlaq_s32(src2_4x32b, c0_4x32b, coeff1_4x32b); /* 29*c2 + 55*c0 - c3 */
c1_4x32b = vmulq_s32(c1_4x32b, coeff0_4x32b); /* 29*c1; subtracted below to give 55*c2 - 29*c1 + c3 */
src1_4x32b = vmulq_s32(src1_4x32b, coeff2_4x32b); /*74*(r0+r1-r3)*/
src3_4x32b = vsubq_s32(src3_4x32b, c1_4x32b);
/* result + add */
src1_4x32b = vaddq_s32(src1_4x32b, add_val);
src0_4x32b = vaddq_s32(src0_4x32b, add_val);
src2_4x32b = vaddq_s32(src2_4x32b, add_val);
src3_4x32b = vaddq_s32(src3_4x32b, add_val);
src1_4x32b = vshrq_n_s32(src1_4x32b, 8);
src0_4x32b = vshrq_n_s32(src0_4x32b, 8);
src2_4x32b = vshrq_n_s32(src2_4x32b, 8);
src3_4x32b = vshrq_n_s32(src3_4x32b, 8);
vst1_s16((pi2_dst + dst_strd), vmovn_s32(src1_4x32b));
vst1_s16(pi2_dst, vmovn_s32(src0_4x32b));
vst1_s16((pi2_dst + 2 * dst_strd), vmovn_s32(src2_4x32b));
vst1_s16((pi2_dst + 3 * dst_strd), vmovn_s32(src3_4x32b));
}
return sad;
}
/**
*******************************************************************************
*
* @brief
* This function performs residue calculation and forward transform on
* input pixels
*
* @par Description:
* Performs residue calculation by subtracting the prediction from the source,
* followed by a forward transform
*
* @param[in] pu1_src
* Input 8x8 pixels
*
* @param[in] pu1_pred
* Prediction data
*
* @param[in] pi4_temp
* Temporary buffer; unused by this function
*
* @param[out] pi2_dst
* Output 8x8 coefficients
*
* @param[in] src_strd
* Input stride
*
* @param[in] pred_strd
* Prediction Stride
*
* @param[in] dst_strd_chr_flag
* Output Stride and Chroma Flag packed in the MS and LS 16 bits respectively
*
* @returns block sad
*
* @remarks
* None
*
*******************************************************************************
*/
UWORD32 ihevc_resi_trans_8x8_neon(
UWORD8 *pu1_src,
UWORD8 *pu1_pred,
WORD32 *pi4_temp,
WORD16 *pi2_dst,
WORD32 src_strd,
WORD32 pred_strd,
WORD32 dst_strd_chr_flag)
{
int16x8_t diff_16[8];
int16x8_t abs = vdupq_n_s16(0);
int32x4_t tmp_a;
int64x2_t tmp_b;
int32x2_t sad_v;
int32x4x2_t a0, a1, a2, a3, a4, a5, a6, a7;
int chroma_flag = dst_strd_chr_flag & 1;
int dst_strd = dst_strd_chr_flag >> 16;
UWORD32 sad;
(void)pi4_temp;
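/* RESIDUE(k, is_chroma): load one row of 8 source and 8 prediction pixels
(de-interleaving one chroma component when is_chroma is set), compute the
16-bit residue row, accumulate its absolute values for the block SAD and
advance the source/prediction pointers to the next row */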
#define RESIDUE(k, is_chroma) \
if(!is_chroma) \
{ \
const uint8x8_t s##k = vld1_u8(pu1_src); \
const uint8x8_t p##k = vld1_u8(pu1_pred); \
diff_16[k] = vreinterpretq_s16_u16(vsubl_u8(s##k, p##k)); \
pu1_src += src_strd; \
pu1_pred += pred_strd; \
abs = vaddq_s16(abs, vabsq_s16(diff_16[k])); \
} \
else \
{ \
const uint8x8_t s##k = vld2_u8(pu1_src).val[0]; \
const uint8x8_t p##k = vld2_u8(pu1_pred).val[0]; \
diff_16[k] = vreinterpretq_s16_u16(vsubl_u8(s##k, p##k)); \
pu1_src += src_strd; \
pu1_pred += pred_strd; \
abs = vaddq_s16(abs, vabsq_s16(diff_16[k])); \
}
// stage 1
RESIDUE(0, chroma_flag);
RESIDUE(1, chroma_flag);
RESIDUE(2, chroma_flag);
RESIDUE(3, chroma_flag);
RESIDUE(4, chroma_flag);
RESIDUE(5, chroma_flag);
RESIDUE(6, chroma_flag);
RESIDUE(7, chroma_flag);
tmp_a = vpaddlq_s16(abs);
tmp_b = vpaddlq_s32(tmp_a);
sad_v = vadd_s32(vreinterpret_s32_s64(vget_low_s64(tmp_b)),
vreinterpret_s32_s64(vget_high_s64(tmp_b)));
sad = vget_lane_s32(sad_v, 0);
transpose_s16_8x8(
&diff_16[0],
&diff_16[1],
&diff_16[2],
&diff_16[3],
&diff_16[4],
&diff_16[5],
&diff_16[6],
&diff_16[7]);
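/* After the transpose each vector holds one column of the residue block, so the
stage 1 butterflies below compute the row transform of all 8 rows in parallel
(one row per lane) */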
{
const int16x8_t o3 = vsubq_s16(diff_16[3], diff_16[4]); /*C3 - C4*/
const int16x8_t o2 = vsubq_s16(diff_16[2], diff_16[5]); /*C2 - C5*/
const int16x8_t o1 = vsubq_s16(diff_16[1], diff_16[6]); /*C1 - C6*/
const int16x8_t o0 = vsubq_s16(diff_16[0], diff_16[7]); /*C0 - C7*/
const int16x8_t e0 = vaddq_s16(diff_16[0], diff_16[7]); /*C0 + C7*/
const int16x8_t e1 = vaddq_s16(diff_16[1], diff_16[6]); /*C1 + C6*/
const int16x8_t e2 = vaddq_s16(diff_16[2], diff_16[5]); /*C2 + C5*/
const int16x8_t e3 = vaddq_s16(diff_16[3], diff_16[4]); /*C3 + C4*/
const int16x8_t ee0 = vaddq_s16(e0, e3); /*C0 + C3 + C4 + C7*/
const int16x8_t ee1 = vaddq_s16(e1, e2); /*C1 + C2 + C5 + C6*/
const int16x8_t eo0 = vsubq_s16(e0, e3); /*C0 - C3 - C4 + C7*/
const int16x8_t eo1 = vsubq_s16(e1, e2); /*C1 - C2 - C5 + C6*/
/*C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7*/
const int16x8_t eee = vaddq_s16(ee1, ee0);
/*C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7*/
const int16x8_t eeo = vsubq_s16(ee0, ee1);
/*F2[0] of 83*(C0 - C3 - C4 + C7)*/
a2.val[0] = vmull_n_s16(vget_low_s16(eo0), 83);
/*F6[0] of 36*(C0 - C3 - C4 + C7)*/
a6.val[0] = vmull_n_s16(vget_low_s16(eo0), 36);
/*F2[1] of 83*(C0 - C3 - C4 + C7)*/
a2.val[1] = vmull_n_s16(vget_high_s16(eo0), 83);
/*F6[1] of 36*(C0 - C3 - C4 + C7)*/
a6.val[1] = vmull_n_s16(vget_high_s16(eo0), 36);
/*F6[1] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)*/
a6.val[1] = vmlsl_n_s16(a6.val[1], vget_high_s16(eo1), 83);
/*F2[1] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)*/
a2.val[1] = vmlal_n_s16(a2.val[1], vget_high_s16(eo1), 36);
/*F6[0] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)*/
a6.val[0] = vmlsl_n_s16(a6.val[0], vget_low_s16(eo1), 83);
/*F2[0] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)*/
a2.val[0] = vmlal_n_s16(a2.val[0], vget_low_s16(eo1), 36);
/*F0[0] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)*/
a0.val[0] = vshll_n_s16(vget_low_s16(eee), 6);
/*F0[1] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)*/
a0.val[1] = vshll_n_s16(vget_high_s16(eee), 6);
/*F4[0] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)*/
a4.val[0] = vshll_n_s16(vget_low_s16(eeo), 6);
/*F4[1] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)*/
a4.val[1] = vshll_n_s16(vget_high_s16(eeo), 6);
a7.val[0] = vmull_n_s16(vget_low_s16(o0), 18); /*F7[0] = 18*(C0 - C7)*/
a5.val[0] = vmull_n_s16(vget_low_s16(o0), 50); /*F5[0] = 50*(C0 - C7)*/
a3.val[0] = vmull_n_s16(vget_low_s16(o0), 75); /*F3[0] = 75*(C0 - C7)*/
a1.val[0] = vmull_n_s16(vget_low_s16(o0), 89); /*F1[0] = 89*(C0 - C7)*/
a1.val[1] = vmull_n_s16(vget_high_s16(o0), 89); /*F1[1] = 89*(C0 - C7)*/
a3.val[1] = vmull_n_s16(vget_high_s16(o0), 75); /*F3[1] = 75*(C0 - C7)*/
a5.val[1] = vmull_n_s16(vget_high_s16(o0), 50); /*F5[1] = 50*(C0 - C7)*/
a7.val[1] = vmull_n_s16(vget_high_s16(o0), 18); /*F7[1] = 18*(C0 - C7)*/
/*F7[0] = 18*(C0 - C7) - 50*(C1 - C6)*/
a7.val[0] = vmlsl_n_s16(a7.val[0], vget_low_s16(o1), 50);
/*F5[0] = 50*(C0 - C7) - 89*(C1 - C6)*/
a5.val[0] = vmlsl_n_s16(a5.val[0], vget_low_s16(o1), 89);
/*F3[0] = 75*(C0 - C7) - 18*(C1 - C6)*/
a3.val[0] = vmlsl_n_s16(a3.val[0], vget_low_s16(o1), 18);
/*F1[0] = 89*(C0 - C7) + 75*(C1 - C6)*/
a1.val[0] = vmlal_n_s16(a1.val[0], vget_low_s16(o1), 75);
/*F1[1] = 89*(C0 - C7) + 75*(C1 - C6)*/
a1.val[1] = vmlal_n_s16(a1.val[1], vget_high_s16(o1), 75);
/*F3[1] = 75*(C0 - C7) - 18*(C1 - C6)*/
a3.val[1] = vmlsl_n_s16(a3.val[1], vget_high_s16(o1), 18);
/*F5[1] = 50*(C0 - C7) - 89*(C1 - C6)*/
a5.val[1] = vmlsl_n_s16(a5.val[1], vget_high_s16(o1), 89);
/*F7[1] = 18*(C0 - C7) - 50*(C1 - C6)*/
a7.val[1] = vmlsl_n_s16(a7.val[1], vget_high_s16(o1), 50);
/*F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)*/
a7.val[0] = vmlal_n_s16(a7.val[0], vget_low_s16(o2), 75);
/*F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)*/
a5.val[0] = vmlal_n_s16(a5.val[0], vget_low_s16(o2), 18);
/*F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)*/
a3.val[0] = vmlsl_n_s16(a3.val[0], vget_low_s16(o2), 89);
/*F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)*/
a1.val[0] = vmlal_n_s16(a1.val[0], vget_low_s16(o2), 50);
/*F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)*/
a1.val[1] = vmlal_n_s16(a1.val[1], vget_high_s16(o2), 50);
/*F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)*/
a3.val[1] = vmlsl_n_s16(a3.val[1], vget_high_s16(o2), 89);
/*F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)*/
a5.val[1] = vmlal_n_s16(a5.val[1], vget_high_s16(o2), 18);
/*F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)*/
a7.val[1] = vmlal_n_s16(a7.val[1], vget_high_s16(o2), 75);
/*F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)*/
a7.val[0] = vmlsl_n_s16(a7.val[0], vget_low_s16(o3), 89);
/*F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)*/
a5.val[0] = vmlal_n_s16(a5.val[0], vget_low_s16(o3), 75);
/*F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)*/
a3.val[0] = vmlsl_n_s16(a3.val[0], vget_low_s16(o3), 50);
/*F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)*/
a1.val[0] = vmlal_n_s16(a1.val[0], vget_low_s16(o3), 18);
/*F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)*/
a1.val[1] = vmlal_n_s16(a1.val[1], vget_high_s16(o3), 18);
/*F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)*/
a3.val[1] = vmlsl_n_s16(a3.val[1], vget_high_s16(o3), 50);
/*F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)*/
a5.val[1] = vmlal_n_s16(a5.val[1], vget_high_s16(o3), 75);
/*F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)*/
a7.val[1] = vmlsl_n_s16(a7.val[1], vget_high_s16(o3), 89);
}
//Stage 2
{
int32x4_t h0, h1, h2, h3, h4, h5, h6, h7;
int32x4_t e0_2, e1_2, e2_2, e3_2;
int32x4_t o0_2, o1_2, o2_2, o3_2;
int32x4_t ee1_2, eo1_2, eo0_2, ee0_2;
int16x4_t row0, row1, row2, row3, row4, row5, row6, row7;
/*Transposing the first four rows (F0-F3) of transform stage 1 output (1)*/
int32x4x2_t b1 = vtrnq_s32(a0.val[1], a1.val[1]);
int32x4x2_t b3 = vtrnq_s32(a2.val[1], a3.val[1]);
int32x4x2_t b0 = vtrnq_s32(a0.val[0], a1.val[0]);
int32x4x2_t b2 = vtrnq_s32(a2.val[0], a3.val[0]);
/*Transposing the first four rows (F0-F3) of transform stage 1 output (2)*/
a0.val[0] = vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b2.val[0]));
a2.val[0] = vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b2.val[0]));
a1.val[0] = vcombine_s32(vget_low_s32(b0.val[1]), vget_low_s32(b2.val[1]));
a3.val[0] = vcombine_s32(vget_high_s32(b0.val[1]), vget_high_s32(b2.val[1]));
a0.val[1] = vcombine_s32(vget_low_s32(b1.val[0]), vget_low_s32(b3.val[0]));
a2.val[1] = vcombine_s32(vget_high_s32(b1.val[0]), vget_high_s32(b3.val[0]));
a1.val[1] = vcombine_s32(vget_low_s32(b1.val[1]), vget_low_s32(b3.val[1]));
a3.val[1] = vcombine_s32(vget_high_s32(b1.val[1]), vget_high_s32(b3.val[1]));
o0_2 = vsubq_s32(a0.val[0], a3.val[1]); /*B0 - B7*/
o1_2 = vsubq_s32(a1.val[0], a2.val[1]); /*B1 - B6*/
o2_2 = vsubq_s32(a2.val[0], a1.val[1]); /*B2 - B5*/
o3_2 = vsubq_s32(a3.val[0], a0.val[1]); /*B3 - B4*/
e3_2 = vaddq_s32(a3.val[0], a0.val[1]); /*B3 + B4*/
e2_2 = vaddq_s32(a2.val[0], a1.val[1]); /*B2 + B5*/
e1_2 = vaddq_s32(a1.val[0], a2.val[1]); /*B1 + B6*/
e0_2 = vaddq_s32(a0.val[0], a3.val[1]); /*B0 + B7*/
eo1_2 = vsubq_s32(e1_2, e2_2); /*B1 - B2 - B5 + B6*/
ee1_2 = vaddq_s32(e1_2, e2_2); /*B1 + B2 + B5 + B6*/
eo0_2 = vsubq_s32(e0_2, e3_2); /*B0 - B3 - B4 + B7*/
ee0_2 = vaddq_s32(e0_2, e3_2); /*B0 + B3 + B4 + B7*/
/* F4 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7*/
h4 = vsubq_s32(ee0_2, ee1_2);
/* F0 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7*/
h0 = vaddq_s32(ee0_2, ee1_2);
/* Truncating last 5 bits in H0*/
row0 = vrshrn_n_s32(h0, 5);
/*First half-row of row 1 of transform stage 2 (H0) stored*/
vst1_s16(pi2_dst, row0);
/* Truncating last 5 bits in H4*/
row4 = vrshrn_n_s32(h4, 5);
/*First half-row of row 5 of transform stage 2 (H4) stored*/
vst1_s16(pi2_dst + 4 * dst_strd, row4);
/* F6 = 36*(B0 - B3 - B4 + B7) */
h6 = vmulq_n_s32(eo0_2, 36);
/* F2 = 83*(B0 - B3 - B4 + B7) */
h2 = vmulq_n_s32(eo0_2, 83);
/*H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)*/
h2 = vmlaq_n_s32(h2, eo1_2, 36);
/*H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)*/
h6 = vmlsq_n_s32(h6, eo1_2, 83);
/* Truncating last 11 bits in H6*/
row6 = vrshrn_n_s32(h6, 11);
/*First half-row of row 7 of transform stage 2 (H6) stored*/
vst1_s16(pi2_dst + 6 * dst_strd, row6);
/* Truncating last 11 bits in H2*/
row2 = vrshrn_n_s32(h2, 11);
/*First half-row of row 3 of transform stage 2 (H2) stored*/
vst1_s16(pi2_dst + 2 * dst_strd, row2);
h1 = vmulq_n_s32(o0_2, 89); /* H1 = 89*(B0 - B7) */
h3 = vmulq_n_s32(o0_2, 75); /* H3 = 75*(B0 - B7) */
h5 = vmulq_n_s32(o0_2, 50); /* H5 = 50*(B0 - B7) */
h7 = vmulq_n_s32(o0_2, 18); /* H7 = 18*(B0 - B7) */
h7 = vmlsq_n_s32(h7, o1_2, 50); /* H7 = 18*(B0 - B7) - 50*(B1 - B6) */
h5 = vmlsq_n_s32(h5, o1_2, 89); /* H5 = 50*(B0 - B7) - 89*(B1 - B6) */
h3 = vmlsq_n_s32(h3, o1_2, 18); /* H3 = 75*(B0 - B7) - 18*(B1 - B6) */
h1 = vmlaq_n_s32(h1, o1_2, 75); /* H1 = 89*(B0 - B7) + 75*(B1 - B6) */
/* H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) */
h1 = vmlaq_n_s32(h1, o2_2, 50);
/* H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) */
h3 = vmlsq_n_s32(h3, o2_2, 89);
/* H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) */
h5 = vmlaq_n_s32(h5, o2_2, 18);
/* H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) */
h7 = vmlaq_n_s32(h7, o2_2, 75);
/* H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4) */
h7 = vmlsq_n_s32(h7, o3_2, 89);
/* Truncating last 11 bits in H7*/
row7 = vrshrn_n_s32(h7, 11);
/*First half-row of row 8 of transform stage 2 (H7) stored*/
vst1_s16(pi2_dst + 7 * dst_strd, row7);
/* H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4) */
h5 = vmlaq_n_s32(h5, o3_2, 75);
/* Truncating last 11 bits in H5*/
row5 = vrshrn_n_s32(h5, 11);
/*First half-row of row 6 of transform stage 2 (H5) stored*/
vst1_s16(pi2_dst + 5 * dst_strd, row5);
/* H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4) */
h3 = vmlsq_n_s32(h3, o3_2, 50);
/* Truncating last 11 bits in H3*/
row3 = vrshrn_n_s32(h3, 11);
/*First half-row of row 4 of transform stage 2 (H3) stored*/
vst1_s16(pi2_dst + 3 * dst_strd, row3);
/* H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4) */
h1 = vmlaq_n_s32(h1, o3_2, 18);
/* Truncating last 11 bits in H1*/
row1 = vrshrn_n_s32(h1, 11);
/*First half-row of row 2 of transform stage 2 (H1) stored*/
vst1_s16(pi2_dst + dst_strd, row1);
}
pi2_dst += 4;
{
int32x4_t h0, h1, h2, h3, h4, h5, h6, h7;
int32x4_t e0_2, e1_2, e2_2, e3_2;
int32x4_t o0_2, o1_2, o2_2, o3_2;
int32x4_t ee1_2, eo1_2, eo0_2, ee0_2;
int16x4_t row0, row1, row2, row3, row4, row5, row6, row7;
/*Transposing second half of transform stage 1 (1)*/
int32x4x2_t b1 = vtrnq_s32(a4.val[1], a5.val[1]);
int32x4x2_t b3 = vtrnq_s32(a6.val[1], a7.val[1]);
int32x4x2_t b0 = vtrnq_s32(a4.val[0], a5.val[0]);
int32x4x2_t b2 = vtrnq_s32(a6.val[0], a7.val[0]);
/*Transposing second half of transform stage 1 (2)*/
a0.val[0] = vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b2.val[0]));
a2.val[0] = vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b2.val[0]));
a1.val[0] = vcombine_s32(vget_low_s32(b0.val[1]), vget_low_s32(b2.val[1]));
a3.val[0] = vcombine_s32(vget_high_s32(b0.val[1]), vget_high_s32(b2.val[1]));
a0.val[1] = vcombine_s32(vget_low_s32(b1.val[0]), vget_low_s32(b3.val[0]));
a2.val[1] = vcombine_s32(vget_high_s32(b1.val[0]), vget_high_s32(b3.val[0]));
a1.val[1] = vcombine_s32(vget_low_s32(b1.val[1]), vget_low_s32(b3.val[1]));
a3.val[1] = vcombine_s32(vget_high_s32(b1.val[1]), vget_high_s32(b3.val[1]));
o0_2 = vsubq_s32(a0.val[0], a3.val[1]); /*B0 - B7*/
o1_2 = vsubq_s32(a1.val[0], a2.val[1]); /*B1 - B6*/
o2_2 = vsubq_s32(a2.val[0], a1.val[1]); /*B2 - B5*/
o3_2 = vsubq_s32(a3.val[0], a0.val[1]); /*B3 - B4*/
e3_2 = vaddq_s32(a3.val[0], a0.val[1]); /*B3 + B4*/
e2_2 = vaddq_s32(a2.val[0], a1.val[1]); /*B2 + B5*/
e1_2 = vaddq_s32(a1.val[0], a2.val[1]); /*B1 + B6*/
e0_2 = vaddq_s32(a0.val[0], a3.val[1]); /*B0 + B7*/
eo1_2 = vsubq_s32(e1_2, e2_2); /*B1 - B2 - B5 + B6*/
ee1_2 = vaddq_s32(e1_2, e2_2); /*B1 + B2 + B5 + B6*/
eo0_2 = vsubq_s32(e0_2, e3_2); /*B0 - B3 - B4 + B7*/
ee0_2 = vaddq_s32(e0_2, e3_2); /*B0 + B3 + B4 + B7*/
/* F4 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7*/
h4 = vsubq_s32(ee0_2, ee1_2);
/* F0 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7*/
h0 = vaddq_s32(ee0_2, ee1_2);
/* Truncating last 5 bits in H0*/
row0 = vrshrn_n_s32(h0, 5);
/*Second half-row of row 1 of transform stage 2 (H0) stored*/
vst1_s16(pi2_dst, row0);
/* Truncating last 5 bits in H4*/
row4 = vrshrn_n_s32(h4, 5);
/*Second half-row of row 5 of transform stage 2 (H4) stored*/
vst1_s16(pi2_dst + 4 * dst_strd, row4);
/* F6 = 36*(B0 - B3 - B4 + B7) */
h6 = vmulq_n_s32(eo0_2, 36);
/* F2 = 83*(B0 - B3 - B4 + B7) */
h2 = vmulq_n_s32(eo0_2, 83);
/*H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)*/
h2 = vmlaq_n_s32(h2, eo1_2, 36);
/*H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)*/
h6 = vmlsq_n_s32(h6, eo1_2, 83);
/* Truncating last 11 bits in H6*/
row6 = vrshrn_n_s32(h6, 11);
/*Second half-row of row 7 of transform stage 2 (H6) stored*/
vst1_s16(pi2_dst + 6 * dst_strd, row6);
/* Truncating last 11 bits in H2*/
row2 = vrshrn_n_s32(h2, 11);
/*Second half-row of row 3 of transform stage 2 (H2) stored*/
vst1_s16(pi2_dst + 2 * dst_strd, row2);
h1 = vmulq_n_s32(o0_2, 89); /* H1 = 89*(B0 - B7) */
h3 = vmulq_n_s32(o0_2, 75); /* H3 = 75*(B0 - B7) */
h5 = vmulq_n_s32(o0_2, 50); /* H5 = 50*(B0 - B7) */
h7 = vmulq_n_s32(o0_2, 18); /* H7 = 18*(B0 - B7) */
h7 = vmlsq_n_s32(h7, o1_2, 50); /* H7 = 18*(B0 - B7) - 50*(B1 - B6) */
h5 = vmlsq_n_s32(h5, o1_2, 89); /* H5 = 50*(B0 - B7) - 89*(B1 - B6) */
h3 = vmlsq_n_s32(h3, o1_2, 18); /* H3 = 75*(B0 - B7) - 18*(B1 - B6) */
h1 = vmlaq_n_s32(h1, o1_2, 75); /* H1 = 89*(B0 - B7) + 75*(B1 - B6) */
/* H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) */
h1 = vmlaq_n_s32(h1, o2_2, 50);
/* H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) */
h3 = vmlsq_n_s32(h3, o2_2, 89);
/* H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) */
h5 = vmlaq_n_s32(h5, o2_2, 18);
/* H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) */
h7 = vmlaq_n_s32(h7, o2_2, 75);
/* H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4) */
h7 = vmlsq_n_s32(h7, o3_2, 89);
/* Truncating last 11 bits in H7*/
row7 = vrshrn_n_s32(h7, 11);
/*Second half-row of row 8 of transform stage 2 (H7) stored*/
vst1_s16(pi2_dst + 7 * dst_strd, row7);
/* H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4) */
h5 = vmlaq_n_s32(h5, o3_2, 75);
/* Truncating last 11 bits in H5*/
row5 = vrshrn_n_s32(h5, 11);
/*Second half-row of row 6 of transform stage 2 (H5) stored*/
vst1_s16(pi2_dst + 5 * dst_strd, row5);
/* H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4) */
h3 = vmlsq_n_s32(h3, o3_2, 50);
/* Truncating last 11 bits in H3*/
row3 = vrshrn_n_s32(h3, 11);
/*Second half-row of row 4 of transform stage 2 (H3) stored*/
vst1_s16(pi2_dst + 3 * dst_strd, row3);
/* H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4) */
h1 = vmlaq_n_s32(h1, o3_2, 18);
/* Truncating last 11 bits in H1*/
row1 = vrshrn_n_s32(h1, 11);
/*Second half-row of row 2 of transform stage 2 (H1) stored*/
vst1_s16(pi2_dst + dst_strd, row1);
}
return sad;
}
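/* Load 16 rows of 8 pixels each; for chroma, de-interleave the two components
and keep only the first one */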
static INLINE void load(const uint8_t *a, int stride, uint8x8_t *b, int is_chroma)
{
int i;
if(is_chroma == 0)
{
for (i = 0; i < 16; i++)
{
b[i] = vld1_u8(a);
a += stride;
}
}
else
{
for (i = 0; i < 16; i++)
{
b[i] = vld2_u8(a).val[0];
a += stride;
}
}
}
// Store 8 rows of 8 16-bit coefficients, assuming a destination stride of 16.
static INLINE void store(WORD16 *a, int16x8_t *b /*[8]*/)
{
int i;
for (i = 0; i < 8; i++)
{
vst1q_s16(a, b[i]);
a += 16;
}
}
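/* First butterfly stage of the 16-point transform:
b[i] = a[i] + a[15 - i] and b[15 - i] = a[i] - a[15 - i], for i = 0..7 */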
static INLINE void cross_input_16(int16x8_t *a /*[16]*/, int16x8_t *b /*[16]*/)
{
b[0] = vaddq_s16(a[0], a[15]);
b[1] = vaddq_s16(a[1], a[14]);
b[2] = vaddq_s16(a[2], a[13]);
b[3] = vaddq_s16(a[3], a[12]);
b[4] = vaddq_s16(a[4], a[11]);
b[5] = vaddq_s16(a[5], a[10]);
b[6] = vaddq_s16(a[6], a[9]);
b[7] = vaddq_s16(a[7], a[8]);
b[8] = vsubq_s16(a[7], a[8]);
b[9] = vsubq_s16(a[6], a[9]);
b[10] = vsubq_s16(a[5], a[10]);
b[11] = vsubq_s16(a[4], a[11]);
b[12] = vsubq_s16(a[3], a[12]);
b[13] = vsubq_s16(a[2], a[13]);
b[14] = vsubq_s16(a[1], a[14]);
b[15] = vsubq_s16(a[0], a[15]);
}
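/* Same butterfly as cross_input_16, applied to the 32-bit intermediate data
(both four-lane halves of each row) */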
static INLINE void cross_input_32(int32x4x2_t *a /*[16][2]*/, int32x4x2_t *b /*[16][2]*/)
{
WORD32 i;
for(i = 0; i < 2; i++)
{
b[0].val[i] = vaddq_s32(a[0].val[i], a[15].val[i]);
b[1].val[i] = vaddq_s32(a[1].val[i], a[14].val[i]);
b[2].val[i] = vaddq_s32(a[2].val[i], a[13].val[i]);
b[3].val[i] = vaddq_s32(a[3].val[i], a[12].val[i]);
b[4].val[i] = vaddq_s32(a[4].val[i], a[11].val[i]);
b[5].val[i] = vaddq_s32(a[5].val[i], a[10].val[i]);
b[6].val[i] = vaddq_s32(a[6].val[i], a[9].val[i]);
b[7].val[i] = vaddq_s32(a[7].val[i], a[8].val[i]);
b[8].val[i] = vsubq_s32(a[7].val[i], a[8].val[i]);
b[9].val[i] = vsubq_s32(a[6].val[i], a[9].val[i]);
b[10].val[i] = vsubq_s32(a[5].val[i], a[10].val[i]);
b[11].val[i] = vsubq_s32(a[4].val[i], a[11].val[i]);
b[12].val[i] = vsubq_s32(a[3].val[i], a[12].val[i]);
b[13].val[i] = vsubq_s32(a[2].val[i], a[13].val[i]);
b[14].val[i] = vsubq_s32(a[1].val[i], a[14].val[i]);
b[15].val[i] = vsubq_s32(a[0].val[i], a[15].val[i]);
}
}
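/* Compute the 16-bit residue c[i] = a[i] - b[i] for 16 rows and return the
partially reduced sum of absolute differences (the caller completes the SAD
reduction) */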
static INLINE int32x4_t diff(uint8x8_t *a /*[16]*/, uint8x8_t *b /*[16]*/, int16x8_t *c /*[16]*/)
{
int i;
int16x8_t abs = vdupq_n_s16(0);
for (i = 0; i < 16; i++)
{
c[i] = vreinterpretq_s16_u16(vsubl_u8(a[i], b[i]));
abs = vaddq_s16(abs, vabsq_s16(c[i]));
}
return vpaddlq_s16(abs);
}
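/* Round the 32-bit second-stage results, shift them right by 13 and narrow
them to 16 bits */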
static INLINE void partial_round_shift(int32x4x2_t *a, int16x8_t *b /*[16]*/)
{
WORD32 shift = 13, add;
add = 1 << (shift - 1);
const int32x4_t vecadd = vdupq_n_s32(add);
b[0] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[0].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[0].val[1], vecadd), 13));
b[1] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[1].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[1].val[1], vecadd), 13));
b[2] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[2].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[2].val[1], vecadd), 13));
b[3] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[3].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[3].val[1], vecadd), 13));
b[4] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[4].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[4].val[1], vecadd), 13));
b[5] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[5].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[5].val[1], vecadd), 13));
b[6] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[6].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[6].val[1], vecadd), 13));
b[7] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[7].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[7].val[1], vecadd), 13));
b[8] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[8].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[8].val[1], vecadd), 13));
b[9] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[9].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[9].val[1], vecadd), 13));
b[10] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[10].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[10].val[1], vecadd), 13));
b[11] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[11].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[11].val[1], vecadd), 13));
b[12] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[12].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[12].val[1], vecadd), 13));
b[13] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[13].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[13].val[1], vecadd), 13));
b[14] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[14].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[14].val[1], vecadd), 13));
b[15] = vcombine_s16(
vshrn_n_s32(vaddq_s32(a[15].val[0], vecadd), 13),
vshrn_n_s32(vaddq_s32(a[15].val[1], vecadd), 13));
}
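/* Sum of four 32-bit vectors */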
static INLINE int32x4_t
add4(int32x4_t row1_low, int32x4_t row1_high, int32x4_t row2_low, int32x4_t row2_high)
{
int32x4_t sum1, sum2;
sum1 = vaddq_s32(row1_low, row1_high);
sum2 = vaddq_s32(row2_low, row2_high);
return vaddq_s32(sum1, sum2);
}
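/* row1 = c * (a + b), row2 = c * (a - b), widening from 16 to 32 bits */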
static INLINE void butterfly_one_coeff_16_32(
int16x8_t a, int16x8_t b, int16_t c, int32x4x2_t *row1, int32x4x2_t *row2)
{
const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
row1->val[0] = vmlal_n_s16(a0, vget_low_s16(b), c);
row1->val[1] = vmlal_n_s16(a1, vget_high_s16(b), c);
row2->val[0] = vmlsl_n_s16(a0, vget_low_s16(b), c);
row2->val[1] = vmlsl_n_s16(a1, vget_high_s16(b), c);
}
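/* row1 = c1 * a + c0 * b, row2 = c0 * a - c1 * b, widening from 16 to 32 bits */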
static INLINE void butterfly_two_coeff_16_32(
int16x8_t a, int16x8_t b, int16_t c0, int16_t c1, int32x4x2_t *row1, int32x4x2_t *row2)
{
const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
row1->val[0] = vmlal_n_s16(a2, vget_low_s16(b), c0);
row1->val[1] = vmlal_n_s16(a3, vget_high_s16(b), c0);
row2->val[0] = vmlsl_n_s16(a0, vget_low_s16(b), c1);
row2->val[1] = vmlsl_n_s16(a1, vget_high_s16(b), c1);
}
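/* row1 = c * (a + b), row2 = c * (a - b), on 32-bit inputs */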
static INLINE void butterfly_one_coeff_32_32(
int32x4x2_t a, int32x4x2_t b, int32_t c, int32x4x2_t *row1, int32x4x2_t *row2)
{
const int32x4_t a0 = vmulq_n_s32(a.val[0], c);
const int32x4_t a1 = vmulq_n_s32(a.val[1], c);
row1->val[0] = vmlaq_n_s32(a0, b.val[0], c);
row1->val[1] = vmlaq_n_s32(a1, b.val[1], c);
row2->val[0] = vmlsq_n_s32(a0, b.val[0], c);
row2->val[1] = vmlsq_n_s32(a1, b.val[1], c);
}
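/* row1 = c1 * a + c0 * b, row2 = c0 * a - c1 * b, on 32-bit inputs */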
static INLINE void butterfly_two_coeff_32_32(
int32x4x2_t a, int32x4x2_t b, int32_t c0, int32_t c1, int32x4x2_t *row1, int32x4x2_t *row2)
{
const int32x4_t a0 = vmulq_n_s32(a.val[0], c0);
const int32x4_t a1 = vmulq_n_s32(a.val[1], c0);
const int32x4_t a2 = vmulq_n_s32(a.val[0], c1);
const int32x4_t a3 = vmulq_n_s32(a.val[1], c1);
row1->val[0] = vmlaq_n_s32(a2, b.val[0], c0);
row1->val[1] = vmlaq_n_s32(a3, b.val[1], c0);
row2->val[0] = vmlsq_n_s32(a0, b.val[0], c1);
row2->val[1] = vmlsq_n_s32(a1, b.val[1], c1);
}
// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
// are all in-place.
static INLINE void transpose_8x8(int32x4x2_t *a /*[8][2]*/, int32x4x2_t *b)
{
const int32x4x2_t c0 = vtrnq_s32(a[0].val[0], a[1].val[0]);
const int32x4x2_t c1 = vtrnq_s32(a[2].val[0], a[3].val[0]);
const int32x4x2_t c2 = vtrnq_s32(a[4].val[0], a[5].val[0]);
const int32x4x2_t c3 = vtrnq_s32(a[6].val[0], a[7].val[0]);
const int32x4x2_t c4 = vtrnq_s32(a[0].val[1], a[1].val[1]);
const int32x4x2_t c5 = vtrnq_s32(a[2].val[1], a[3].val[1]);
const int32x4x2_t c6 = vtrnq_s32(a[4].val[1], a[5].val[1]);
const int32x4x2_t c7 = vtrnq_s32(a[6].val[1], a[7].val[1]);
const int32x4x2_t d0 = vtrnq_s64_to_s32(c0.val[0], c1.val[0]);
const int32x4x2_t d1 = vtrnq_s64_to_s32(c0.val[1], c1.val[1]);
const int32x4x2_t d2 = vtrnq_s64_to_s32(c2.val[0], c3.val[0]);
const int32x4x2_t d3 = vtrnq_s64_to_s32(c2.val[1], c3.val[1]);
const int32x4x2_t d4 = vtrnq_s64_to_s32(c4.val[0], c5.val[0]);
const int32x4x2_t d5 = vtrnq_s64_to_s32(c4.val[1], c5.val[1]);
const int32x4x2_t d6 = vtrnq_s64_to_s32(c6.val[0], c7.val[0]);
const int32x4x2_t d7 = vtrnq_s64_to_s32(c6.val[1], c7.val[1]);
b[0].val[0] = d0.val[0];
b[0].val[1] = d2.val[0];
b[1].val[0] = d1.val[0];
b[1].val[1] = d3.val[0];
b[2].val[0] = d0.val[1];
b[2].val[1] = d2.val[1];
b[3].val[0] = d1.val[1];
b[3].val[1] = d3.val[1];
b[4].val[0] = d4.val[0];
b[4].val[1] = d6.val[0];
b[5].val[0] = d5.val[0];
b[5].val[1] = d7.val[0];
b[6].val[0] = d4.val[1];
b[6].val[1] = d6.val[1];
b[7].val[0] = d5.val[1];
b[7].val[1] = d7.val[1];
}
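/* One pass of the 16-point forward transform: takes the cross_input_16 output
(in[0..7] hold the sums, in[8..15] the differences) and produces the sixteen
32-bit output rows using the HEVC 16-point DCT coefficients */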
static void dct_body_16_32(int16x8_t *in /*[16]*/, int32x4x2_t *out /*[16]*/)
{
int16x8_t s[8];
int16x8_t x[4];
int32x4x2_t tmp0, tmp1, tmp2, tmp3;
int32x4x2_t tmp4, tmp5, tmp6, tmp7;
s[0] = vaddq_s16(in[0], in[7]);
s[1] = vaddq_s16(in[1], in[6]);
s[2] = vaddq_s16(in[2], in[5]);
s[3] = vaddq_s16(in[3], in[4]);
s[4] = vsubq_s16(in[3], in[4]);
s[5] = vsubq_s16(in[2], in[5]);
s[6] = vsubq_s16(in[1], in[6]);
s[7] = vsubq_s16(in[0], in[7]);
x[0] = vaddq_s16(s[0], s[3]);
x[1] = vaddq_s16(s[1], s[2]);
x[2] = vsubq_s16(s[1], s[2]);
x[3] = vsubq_s16(s[0], s[3]);
// Type 1
// out[0] = 64 * (x0 + x1)
// out[8] = 64 * (x0 - x1)
butterfly_one_coeff_16_32(x[0], x[1], 64, &out[0], &out[8]);
// out[4]  = 83 * x3 + 36 * x2
// out[12] = 36 * x3 - 83 * x2
butterfly_two_coeff_16_32(x[3], x[2], 36, 83, &out[4], &out[12]);
// Type 2
butterfly_two_coeff_16_32(s[7], s[4], 18, 89, &tmp0, &tmp1);
butterfly_two_coeff_16_32(s[5], s[6], 75, 50, &tmp2, &tmp3);
out[2].val[0] = vaddq_s32(tmp0.val[0], tmp2.val[0]);
out[2].val[1] = vaddq_s32(tmp0.val[1], tmp2.val[1]);
out[14].val[0] = vaddq_s32(tmp1.val[0], tmp3.val[0]);
out[14].val[1] = vaddq_s32(tmp1.val[1], tmp3.val[1]);
butterfly_two_coeff_16_32(s[7], s[4], 75, 50, &tmp0, &tmp1);
butterfly_two_coeff_16_32(s[5], s[6], -89, 18, &tmp2, &tmp3);
out[10].val[0] = vaddq_s32(tmp0.val[0], tmp2.val[0]);
out[10].val[1] = vaddq_s32(tmp0.val[1], tmp2.val[1]);
out[6].val[0] = vaddq_s32(tmp1.val[0], tmp3.val[0]);
out[6].val[1] = vaddq_s32(tmp1.val[1], tmp3.val[1]);
// Type 3
butterfly_two_coeff_16_32(in[8], in[15], 9, -90, &tmp0, &tmp1);
butterfly_two_coeff_16_32(in[9], in[14], 87, 25, &tmp2, &tmp3);
butterfly_two_coeff_16_32(in[10], in[13], 43, -80, &tmp4, &tmp5);
butterfly_two_coeff_16_32(in[11], in[12], 70, 57, &tmp6, &tmp7);
out[1].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
out[1].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
out[15].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
out[15].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
butterfly_two_coeff_16_32(in[8], in[15], 87, -25, &tmp0, &tmp1);
butterfly_two_coeff_16_32(in[9], in[14], -70, -57, &tmp2, &tmp3);
butterfly_two_coeff_16_32(in[10], in[13], 9, -90, &tmp4, &tmp5);
butterfly_two_coeff_16_32(in[11], in[12], -80, 43, &tmp6, &tmp7);
out[3].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
out[3].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
out[13].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
out[13].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
butterfly_two_coeff_16_32(in[8], in[15], 43, -80, &tmp0, &tmp1);
butterfly_two_coeff_16_32(in[9], in[14], 9, 90, &tmp2, &tmp3);
butterfly_two_coeff_16_32(in[10], in[13], 57, 70, &tmp4, &tmp5);
butterfly_two_coeff_16_32(in[11], in[12], -87, -25, &tmp6, &tmp7);
out[5].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
out[5].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
out[11].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
out[11].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
butterfly_two_coeff_16_32(in[8], in[15], 70, -57, &tmp0, &tmp1);
butterfly_two_coeff_16_32(in[9], in[14], -80, 43, &tmp2, &tmp3);
butterfly_two_coeff_16_32(in[10], in[13], -87, 25, &tmp4, &tmp5);
butterfly_two_coeff_16_32(in[11], in[12], 90, -9, &tmp6, &tmp7);
out[7].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
out[7].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
out[9].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
out[9].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
}
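/* Same 16-point transform body as dct_body_16_32, operating on the 32-bit
intermediate values of the second pass */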
static void dct_body_32_32(int32x4x2_t *in /*[16]*/, int32x4x2_t *out /*[16]*/)
{
int32x4x2_t s[8];
int32x4x2_t x[4];
int32x4x2_t tmp0, tmp1, tmp2, tmp3;
int32x4x2_t tmp4, tmp5, tmp6, tmp7;
WORD32 i;
for(i = 0; i < 2; i++)
{
s[0].val[i] = vaddq_s32(in[0].val[i], in[7].val[i]);
s[1].val[i] = vaddq_s32(in[1].val[i], in[6].val[i]);
s[2].val[i] = vaddq_s32(in[2].val[i], in[5].val[i]);
s[3].val[i] = vaddq_s32(in[3].val[i], in[4].val[i]);
s[4].val[i] = vsubq_s32(in[3].val[i], in[4].val[i]);
s[5].val[i] = vsubq_s32(in[2].val[i], in[5].val[i]);
s[6].val[i] = vsubq_s32(in[1].val[i], in[6].val[i]);
s[7].val[i] = vsubq_s32(in[0].val[i], in[7].val[i]);
x[0].val[i] = vaddq_s32(s[0].val[i], s[3].val[i]);
x[1].val[i] = vaddq_s32(s[1].val[i], s[2].val[i]);
x[2].val[i] = vsubq_s32(s[1].val[i], s[2].val[i]);
x[3].val[i] = vsubq_s32(s[0].val[i], s[3].val[i]);
}
// Type 1
// out[0] = 64 * (x0 + x1)
// out[8] = 64 * (x0 - x1)
butterfly_one_coeff_32_32(x[0], x[1], 64, &out[0], &out[8]);
// out[4]  = 83 * x3 + 36 * x2
// out[12] = 36 * x3 - 83 * x2
butterfly_two_coeff_32_32(x[3], x[2], 36, 83, &out[4], &out[12]);
// Type 2
butterfly_two_coeff_32_32(s[7], s[4], 18, 89, &tmp0, &tmp1);
butterfly_two_coeff_32_32(s[5], s[6], 75, 50, &tmp2, &tmp3);
out[2].val[0] = vaddq_s32(tmp0.val[0], tmp2.val[0]);
out[2].val[1] = vaddq_s32(tmp0.val[1], tmp2.val[1]);
out[14].val[0] = vaddq_s32(tmp1.val[0], tmp3.val[0]);
out[14].val[1] = vaddq_s32(tmp1.val[1], tmp3.val[1]);
butterfly_two_coeff_32_32(s[7], s[4], 75, 50, &tmp0, &tmp1);
butterfly_two_coeff_32_32(s[5], s[6], -89, 18, &tmp2, &tmp3);
out[10].val[0] = vaddq_s32(tmp0.val[0], tmp2.val[0]);
out[10].val[1] = vaddq_s32(tmp0.val[1], tmp2.val[1]);
out[6].val[0] = vaddq_s32(tmp1.val[0], tmp3.val[0]);
out[6].val[1] = vaddq_s32(tmp1.val[1], tmp3.val[1]);
// Type 3
butterfly_two_coeff_32_32(in[8], in[15], 9, -90, &tmp0, &tmp1);
butterfly_two_coeff_32_32(in[9], in[14], 87, 25, &tmp2, &tmp3);
butterfly_two_coeff_32_32(in[10], in[13], 43, -80, &tmp4, &tmp5);
butterfly_two_coeff_32_32(in[11], in[12], 70, 57, &tmp6, &tmp7);
out[1].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
out[1].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
out[15].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
out[15].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
butterfly_two_coeff_32_32(in[8], in[15], 87, -25, &tmp0, &tmp1);
butterfly_two_coeff_32_32(in[9], in[14], -70, -57, &tmp2, &tmp3);
butterfly_two_coeff_32_32(in[10], in[13], 9, -90, &tmp4, &tmp5);
butterfly_two_coeff_32_32(in[11], in[12], -80, 43, &tmp6, &tmp7);
out[3].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
out[3].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
out[13].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
out[13].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
butterfly_two_coeff_32_32(in[8], in[15], 43, -80, &tmp0, &tmp1);
butterfly_two_coeff_32_32(in[9], in[14], 9, 90, &tmp2, &tmp3);
butterfly_two_coeff_32_32(in[10], in[13], 57, 70, &tmp4, &tmp5);
butterfly_two_coeff_32_32(in[11], in[12], -87, -25, &tmp6, &tmp7);
out[5].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
out[5].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
out[11].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
out[11].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
butterfly_two_coeff_32_32(in[8], in[15], 70, -57, &tmp0, &tmp1);
butterfly_two_coeff_32_32(in[9], in[14], -80, 43, &tmp2, &tmp3);
butterfly_two_coeff_32_32(in[10], in[13], -87, 25, &tmp4, &tmp5);
butterfly_two_coeff_32_32(in[11], in[12], 90, -9, &tmp6, &tmp7);
out[7].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
out[7].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
out[9].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
out[9].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
}
/**
*******************************************************************************
*
* @brief
* This function performs residue calculation and forward transform on
* input pixels
*
* @par Description:
* Performs residue calculation by subtracting the prediction from the source,
* followed by a forward transform
*
* @param[in] pu1_src
* Input 16x16 pixels
*
* @param[in] pu1_pred
* Prediction data
*
* @param[in] pi4_temp
* Temporary buffer; unused by this function
*
* @param[out] pi2_dst
* Output 16x16 coefficients
*
* @param[in] src_strd
* Input stride
*
* @param[in] pred_strd
* Prediction Stride
*
* @param[in] dst_strd_chr_flag
* Output Stride and Chroma Flag packed in the MS and LS 16 bits respectively
*
* @returns block sad
*
* @remarks
* None
*
*******************************************************************************
*/
UWORD32 ihevc_resi_trans_16x16_neon(
UWORD8 *pu1_src,
UWORD8 *pu1_pred,
WORD32 *pi4_temp,
WORD16 *pi2_dst,
WORD32 src_strd,
WORD32 pred_strd,
WORD32 dst_strd_chr_flag)
{
UWORD32 u4_blk_sad = 0;
WORD32 chroma_flag;
WORD32 dst_strd;
uint8x8_t temp0[16], temp1[16];
int16x8_t temp2[16], temp3[16];
int32x4_t tmp_a, tmp_b;
int64x2_t tmp_c;
int32x2_t sad_v;
int32x4x2_t out0[16], out1[16], temp4[16], temp5[16];
(void)pi4_temp;
chroma_flag = dst_strd_chr_flag & 1;
dst_strd = dst_strd_chr_flag >> 16;
/* Residue + Forward Transform 1st stage */
// Left half.
load(pu1_src, src_strd, temp0, chroma_flag);
load(pu1_pred, pred_strd, temp1, chroma_flag);
tmp_a = diff(temp0, temp1, temp2);
cross_input_16(temp2, temp3);
dct_body_16_32(temp3, out0);
// Right half.
load(pu1_src + 8 * (1 + chroma_flag), src_strd, temp0, chroma_flag);
load(pu1_pred + 8 * (1 + chroma_flag), pred_strd, temp1, chroma_flag);
tmp_b = diff(temp0, temp1, temp2);
cross_input_16(temp2, temp3);
dct_body_16_32(temp3, out1);
tmp_a = vaddq_s32(tmp_a, tmp_b);
tmp_c = vpaddlq_s32(tmp_a);
sad_v = vadd_s32(vreinterpret_s32_s64(vget_low_s64(tmp_c)),
vreinterpret_s32_s64(vget_high_s64(tmp_c)));
u4_blk_sad = vget_lane_s32(sad_v, 0);
// Transpose the top-left and top-right quarters into one contiguous location
// to process the top half.
transpose_8x8(&out0[0], &temp4[0]);
transpose_8x8(&out1[0], &temp4[8]);
cross_input_32(temp4, temp5);
dct_body_32_32(temp5, temp4);
partial_round_shift(temp4, temp2);
transpose_s16_8x8(
&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]);
transpose_s16_8x8(
&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], &temp2[13], &temp2[14], &temp2[15]);
store(pi2_dst, &temp2[0]);
store(pi2_dst + 8, &temp2[8]);
pi2_dst += 8 * dst_strd;
// Transpose the bottom-left and bottom-right quarters into one contiguous
// location to process the bottom half.
transpose_8x8(&out0[8], &out1[0]);
transpose_s32_8x8(
&out1[8], &out1[9], &out1[10], &out1[11], &out1[12], &out1[13], &out1[14], &out1[15]);
cross_input_32(out1, temp5);
dct_body_32_32(temp5, temp4);
partial_round_shift(temp4, temp2);
transpose_s16_8x8(
&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]);
transpose_s16_8x8(
&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], &temp2[13], &temp2[14], &temp2[15]);
store(pi2_dst, &temp2[0]);
store(pi2_dst + 8, &temp2[8]);
return u4_blk_sad;
}