/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
* ihevc_weighted_pred_neon_intr.c
*
* @brief
* Contains function definitions for weighted prediction used in inter
* prediction
*
* @author
* Parthiban V
*
* @par List of Functions:
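* - ihevc_weighted_pred_uni_neonintr()
* - ihevc_weighted_pred_chroma_uni_neonintr()
* - ihevc_weighted_pred_bi_neonintr()
* - ihevc_weighted_pred_chroma_bi_neonintr()
* - ihevc_weighted_pred_bi_default_neonintr()
* - ihevc_weighted_pred_chroma_bi_default_neonintr()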
*
* @remarks
* None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_inter_pred.h"
#include "arm_neon.h"
/**
*******************************************************************************
*
* @brief
*  Uni-directional weighted prediction: scales the 16-bit samples at pi2_src
*  by a single weight, adds the offset and writes the result, saturated to
*  8 bits, at pu1_dst.
*
* @par Description:
*  dst = sat_u8( ( src * wgt0 + lvl_shift * wgt0 + off0 * (1 << shift)
*                  + (1 << (shift - 1)) ) >> shift )
*  which is the same value as
*  sat_u8( ( ((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift ) + off0 )
*  because the offset term is an exact multiple of (1 << shift).
*
* @par Assumptions:
*  Every inner iteration consumes 4 samples from each of 2 consecutive rows,
*  so the block is covered without over-run only when wd is a multiple of 4
*  and ht is a multiple of 2. shift must be at least 1.
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride, in 16-bit samples
*
* @param[in] dst_strd
*  Destination stride, in bytes
*
* @param[in] wgt0
*  Weight multiplied with the source (used as a 16-bit value in the multiply)
*
* @param[in] off0
*  Offset added after rounding and shifting
*
* @param[in] shift
*  Right shift applied to the weighted sum; per the original notes this is
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  Value added to every source sample before weighting
*
* @param[in] ht
*  Height of the block in rows
*
* @param[in] wd
*  Width of the block in samples
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 wgt0,
                                      WORD32 off0,
                                      WORD32 shift,
                                      WORD32 lvl_shift,
                                      WORD32 ht,
                                      WORD32 wd)
{
    WORD32 row, col;
    WORD32 bias;
    int16x4_t src_row0, src_row1;
    int32x4_t acc_row0, acc_row1;
    uint16x4_t res16;
    uint8x8_t res8;
    int32x4_t bias_t;
    int32x4_t neg_shift_t;
    WORD16 *pi2_src_nxt;
    UWORD8 *pu1_dst_nxt;

    /* Everything that does not depend on the sample is folded into a single */
    /* constant. The offset is scaled by a multiplication instead of         */
    /* off0 << shift: left-shifting a negative value is undefined in C and   */
    /* off0 may be negative.                                                 */
    bias = lvl_shift * wgt0 + off0 * (1 << shift);
    bias += (1 << (shift - 1));
    bias_t = vmovq_n_s32(bias);

    /* vshlq_s32 with a negative count performs an arithmetic right shift */
    neg_shift_t = vmovq_n_s32(0 - shift);

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_nxt = pi2_src + src_strd;
            pu1_dst_nxt = pu1_dst + dst_strd;

            /* 4 samples from the current row and 4 from the row below */
            src_row0 = vld1_s16((int16_t *)pi2_src);
            src_row1 = vld1_s16((int16_t *)pi2_src_nxt);
            pi2_src += 4;

            /* src * wgt0 widened to 32 bits, plus the folded constant */
            acc_row0 = vaddq_s32(vmull_n_s16(src_row0, (int16_t)wgt0), bias_t);
            acc_row1 = vaddq_s32(vmull_n_s16(src_row1, (int16_t)wgt0), bias_t);

            /* Shift, then saturate 32 -> 16 -> 8 bits. vqmovn_u16 needs a   */
            /* 128-bit input, so the 4 values are duplicated in both halves  */
            /* and only the first 4 bytes (32-bit lane 0) are stored.        */
            res16 = vqmovun_s32(vshlq_s32(acc_row0, neg_shift_t));
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res8), 0);
            pu1_dst += 4;

            res16 = vqmovun_s32(vshlq_s32(acc_row1, neg_shift_t));
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res8), 0);
        }
        /* Rewind the columns walked above and step down two rows */
        pi2_src += 2 * src_strd - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_UNI
/**
*******************************************************************************
*
* @brief
*  Chroma uni-directional weighted prediction: scales the 16-bit samples at
*  pi2_src with alternating Cb / Cr weights and offsets and writes the result,
*  saturated to 8 bits, at pu1_dst.
*
* @par Description:
*  For every sample, with (wgt, off) = (wgt0_cb, off0_cb) on even positions
*  and (wgt0_cr, off0_cr) on odd positions of each group of 4:
*  dst = sat_u8( ( src * wgt + lvl_shift * wgt + off * (1 << shift)
*                  + (1 << (shift - 1)) ) >> shift )
*  which is the same value as
*  sat_u8( ( ((src + lvl_shift) * wgt + (1 << (shift - 1))) >> shift ) + off )
*
* @par Assumptions:
*  2 * wd samples are processed per row (wd presumably counts Cb/Cr pairs of
*  an interleaved plane - confirm against the caller). Every inner iteration
*  consumes 4 samples from each of 2 consecutive rows, so 2 * wd is expected
*  to be a multiple of 4 and ht a multiple of 2. shift must be at least 1.
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride, in 16-bit samples
*
* @param[in] dst_strd
*  Destination stride, in bytes
*
* @param[in] wgt0_cb
*  Weight applied to the even (Cb) positions
*
* @param[in] wgt0_cr
*  Weight applied to the odd (Cr) positions
*
* @param[in] off0_cb
*  Offset added to the even (Cb) positions after rounding and shifting
*
* @param[in] off0_cr
*  Offset added to the odd (Cr) positions after rounding and shifting
*
* @param[in] shift
*  Right shift applied to the weighted sum; per the original notes this is
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  Value added to every source sample before weighting
*
* @param[in] ht
*  Height of the block in rows
*
* @param[in] wd
*  Width of the block; 2 * wd samples are processed per row
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD32 wgt0_cb,
                                             WORD32 wgt0_cr,
                                             WORD32 off0_cb,
                                             WORD32 off0_cr,
                                             WORD32 shift,
                                             WORD32 lvl_shift,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    WORD32 bias_cb, bias_cr;
    int16x4_t src_row0, src_row1;
    int32x4_t acc_row0, acc_row1;
    uint16x4_t res16;
    uint8x8_t res8;
    int32x4x2_t bias_zip;
    int16x4x2_t wgt_zip;
    int32x4_t bias_t;
    int16x4_t wgt_t;
    int32x4_t neg_shift_t;
    WORD16 *pi2_src_nxt;
    UWORD8 *pu1_dst_nxt;

    /* Per-component constants. The offsets are scaled by a multiplication */
    /* instead of off << shift: left-shifting a negative value is          */
    /* undefined in C and the offsets may be negative.                     */
    bias_cb = lvl_shift * wgt0_cb + off0_cb * (1 << shift);
    bias_cb += (1 << (shift - 1));
    bias_cr = lvl_shift * wgt0_cr + off0_cr * (1 << shift);
    bias_cr += (1 << (shift - 1));

    /* Zipping two splatted vectors gives {cb, cr, cb, cr} in val[0] */
    bias_zip = vzipq_s32(vmovq_n_s32(bias_cb), vmovq_n_s32(bias_cr));
    bias_t = bias_zip.val[0];
    wgt_zip = vzip_s16(vdup_n_s16(wgt0_cb), vdup_n_s16(wgt0_cr));
    wgt_t = wgt_zip.val[0];

    /* vshlq_s32 with a negative count performs an arithmetic right shift */
    neg_shift_t = vmovq_n_s32(0 - shift);

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_nxt = pi2_src + src_strd;
            pu1_dst_nxt = pu1_dst + dst_strd;

            /* 4 samples from the current row and 4 from the row below */
            src_row0 = vld1_s16((int16_t *)pi2_src);
            src_row1 = vld1_s16((int16_t *)pi2_src_nxt);
            pi2_src += 4;

            /* Lane-wise src * wgt widened to 32 bits, plus the constant */
            acc_row0 = vaddq_s32(vmull_s16(src_row0, wgt_t), bias_t);
            acc_row1 = vaddq_s32(vmull_s16(src_row1, wgt_t), bias_t);

            /* Shift, then saturate 32 -> 16 -> 8 bits. vqmovn_u16 needs a   */
            /* 128-bit input, so the 4 values are duplicated in both halves  */
            /* and only the first 4 bytes (32-bit lane 0) are stored.        */
            res16 = vqmovun_s32(vshlq_s32(acc_row0, neg_shift_t));
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res8), 0);
            pu1_dst += 4;

            res16 = vqmovun_s32(vshlq_s32(acc_row1, neg_shift_t));
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res8), 0);
        }
        /* Rewind the columns walked above and step down two rows */
        pi2_src += 2 * src_strd - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_UNI
/**
*******************************************************************************
*
* @brief
*  Bi-directional weighted prediction: combines the 16-bit samples at
*  pi2_src1 and pi2_src2 with their respective weights and writes the result,
*  saturated to 8 bits, at pu1_dst.
*
* @par Description:
*  dst = sat_u8( ( src1 * wgt0 + src2 * wgt1
*                  + lvl_shift1 * wgt0 + lvl_shift2 * wgt1
*                  + (off0 + off1 + 1) * (1 << (shift - 1)) ) >> shift )
*  i.e. ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1
*         + ((off0 + off1 + 1) << (shift - 1)) ) >> shift
*
* @par Assumptions:
*  Every inner iteration consumes 4 samples from each of 2 consecutive rows,
*  so the block is covered without over-run only when wd is a multiple of 4
*  and ht is a multiple of 2. shift must be at least 1.
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Stride of source 1, in 16-bit samples
*
* @param[in] src_strd2
*  Stride of source 2, in 16-bit samples
*
* @param[in] dst_strd
*  Destination stride, in bytes
*
* @param[in] wgt0
*  Weight multiplied with source 1
*
* @param[in] off0
*  Offset 0
*
* @param[in] wgt1
*  Weight multiplied with source 2
*
* @param[in] off1
*  Offset 1
*
* @param[in] shift
*  Right shift applied to the weighted sum; per the original notes this is
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  Value added to every sample of source 1 before weighting
*
* @param[in] lvl_shift2
*  Value added to every sample of source 2 before weighting
*
* @param[in] ht
*  Height of the block in rows
*
* @param[in] wd
*  Width of the block in samples
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
                                     WORD16 *pi2_src2,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd1,
                                     WORD32 src_strd2,
                                     WORD32 dst_strd,
                                     WORD32 wgt0,
                                     WORD32 off0,
                                     WORD32 wgt1,
                                     WORD32 off1,
                                     WORD32 shift,
                                     WORD32 lvl_shift1,
                                     WORD32 lvl_shift2,
                                     WORD32 ht,
                                     WORD32 wd)
{
    WORD32 row, col;
    WORD32 bias;
    int16x4_t src1_row0, src1_row1;
    int16x4_t src2_row0, src2_row1;
    int32x4_t acc_row0, acc_row1;
    uint16x4_t res16;
    uint8x8_t res8;
    int32x4_t bias_t;
    int32x4_t neg_shift_t;
    WORD16 *pi2_src1_nxt;
    WORD16 *pi2_src2_nxt;
    UWORD8 *pu1_dst_nxt;

    /* Everything that does not depend on the samples is folded into a    */
    /* single constant. The combined offset is scaled by a multiplication */
    /* instead of a left shift: (off0 + off1 + 1) may be negative and     */
    /* left-shifting a negative value is undefined in C.                  */
    bias = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
    bias += (off0 + off1 + 1) * (1 << (shift - 1));
    bias_t = vmovq_n_s32(bias);

    /* vshlq_s32 with a negative count performs an arithmetic right shift */
    neg_shift_t = vmovq_n_s32(0 - shift);

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src1_nxt = pi2_src1 + src_strd1;
            pi2_src2_nxt = pi2_src2 + src_strd2;
            pu1_dst_nxt = pu1_dst + dst_strd;

            /* 4 samples per source from the current row and the row below */
            src1_row0 = vld1_s16((int16_t *)pi2_src1);
            src2_row0 = vld1_s16((int16_t *)pi2_src2);
            src1_row1 = vld1_s16((int16_t *)pi2_src1_nxt);
            src2_row1 = vld1_s16((int16_t *)pi2_src2_nxt);
            pi2_src1 += 4;
            pi2_src2 += 4;

            /* src1 * wgt0 + src2 * wgt1 + constant, in 32 bits */
            acc_row0 = vaddq_s32(vmull_n_s16(src1_row0, (int16_t)wgt0),
                                 vmull_n_s16(src2_row0, (int16_t)wgt1));
            acc_row0 = vaddq_s32(acc_row0, bias_t);
            acc_row1 = vaddq_s32(vmull_n_s16(src1_row1, (int16_t)wgt0),
                                 vmull_n_s16(src2_row1, (int16_t)wgt1));
            acc_row1 = vaddq_s32(acc_row1, bias_t);

            /* Shift, then saturate 32 -> 16 -> 8 bits. vqmovn_u16 needs a   */
            /* 128-bit input, so the 4 values are duplicated in both halves  */
            /* and only the first 4 bytes (32-bit lane 0) are stored.        */
            res16 = vqmovun_s32(vshlq_s32(acc_row0, neg_shift_t));
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res8), 0);
            pu1_dst += 4;

            res16 = vqmovun_s32(vshlq_s32(acc_row1, neg_shift_t));
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res8), 0);
        }
        /* Rewind the columns walked above and step down two rows */
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI
/**
*******************************************************************************
*
* @brief
*  Chroma bi-directional weighted prediction: combines the 16-bit samples at
*  pi2_src1 and pi2_src2 using alternating Cb / Cr weights and offsets and
*  writes the result, saturated to 8 bits, at pu1_dst.
*
* @par Description:
*  For every sample, with the _cb parameters on even positions and the _cr
*  parameters on odd positions of each group of 4:
*  dst = sat_u8( ( src1 * wgt0 + src2 * wgt1
*                  + lvl_shift1 * wgt0 + lvl_shift2 * wgt1
*                  + (off0 + off1 + 1) * (1 << (shift - 1)) ) >> shift )
*
* @par Assumptions:
*  2 * wd samples are processed per row (wd presumably counts Cb/Cr pairs of
*  an interleaved plane - confirm against the caller). Every inner iteration
*  consumes 4 samples from each of 2 consecutive rows, so 2 * wd is expected
*  to be a multiple of 4 and ht a multiple of 2. shift must be at least 1.
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Stride of source 1, in 16-bit samples
*
* @param[in] src_strd2
*  Stride of source 2, in 16-bit samples
*
* @param[in] dst_strd
*  Destination stride, in bytes
*
* @param[in] wgt0_cb
*  Weight multiplied with the even (Cb) positions of source 1
*
* @param[in] wgt0_cr
*  Weight multiplied with the odd (Cr) positions of source 1
*
* @param[in] off0_cb
*  Offset 0 for the even (Cb) positions
*
* @param[in] off0_cr
*  Offset 0 for the odd (Cr) positions
*
* @param[in] wgt1_cb
*  Weight multiplied with the even (Cb) positions of source 2
*
* @param[in] wgt1_cr
*  Weight multiplied with the odd (Cr) positions of source 2
*
* @param[in] off1_cb
*  Offset 1 for the even (Cb) positions
*
* @param[in] off1_cr
*  Offset 1 for the odd (Cr) positions
*
* @param[in] shift
*  Right shift applied to the weighted sum; per the original notes this is
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  Value added to every sample of source 1 before weighting
*
* @param[in] lvl_shift2
*  Value added to every sample of source 2 before weighting
*
* @param[in] ht
*  Height of the block in rows
*
* @param[in] wd
*  Width of the block; 2 * wd samples are processed per row
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
                                            WORD16 *pi2_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 wgt0_cb,
                                            WORD32 wgt0_cr,
                                            WORD32 off0_cb,
                                            WORD32 off0_cr,
                                            WORD32 wgt1_cb,
                                            WORD32 wgt1_cr,
                                            WORD32 off1_cb,
                                            WORD32 off1_cr,
                                            WORD32 shift,
                                            WORD32 lvl_shift1,
                                            WORD32 lvl_shift2,
                                            WORD32 ht,
                                            WORD32 wd)
{
    WORD32 row, col;
    WORD32 bias_cb, bias_cr;
    int16x4_t src1_row0, src1_row1;
    int16x4_t src2_row0, src2_row1;
    int32x4_t acc_row0, acc_row1;
    uint16x4_t res16;
    uint8x8_t res8;
    int32x4x2_t bias_zip;
    int16x4x2_t wgt0_zip, wgt1_zip;
    int32x4_t bias_t;
    int16x4_t wgt0_t, wgt1_t;
    int32x4_t neg_shift_t;
    WORD16 *pi2_src1_nxt;
    WORD16 *pi2_src2_nxt;
    UWORD8 *pu1_dst_nxt;

    /* Per-component constants. The combined offsets are scaled by a      */
    /* multiplication instead of a left shift: (off0 + off1 + 1) may be   */
    /* negative and left-shifting a negative value is undefined in C.     */
    bias_cb = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
    bias_cb += (off0_cb + off1_cb + 1) * (1 << (shift - 1));
    bias_cr = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
    bias_cr += (off0_cr + off1_cr + 1) * (1 << (shift - 1));

    /* Zipping two splatted vectors gives {cb, cr, cb, cr} in val[0] */
    bias_zip = vzipq_s32(vmovq_n_s32(bias_cb), vmovq_n_s32(bias_cr));
    bias_t = bias_zip.val[0];
    wgt0_zip = vzip_s16(vdup_n_s16(wgt0_cb), vdup_n_s16(wgt0_cr));
    wgt0_t = wgt0_zip.val[0];
    wgt1_zip = vzip_s16(vdup_n_s16(wgt1_cb), vdup_n_s16(wgt1_cr));
    wgt1_t = wgt1_zip.val[0];

    /* vshlq_s32 with a negative count performs an arithmetic right shift */
    neg_shift_t = vmovq_n_s32(0 - shift);

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src1_nxt = pi2_src1 + src_strd1;
            pi2_src2_nxt = pi2_src2 + src_strd2;
            pu1_dst_nxt = pu1_dst + dst_strd;

            /* 4 samples per source from the current row and the row below */
            src1_row0 = vld1_s16((int16_t *)pi2_src1);
            src2_row0 = vld1_s16((int16_t *)pi2_src2);
            src1_row1 = vld1_s16((int16_t *)pi2_src1_nxt);
            src2_row1 = vld1_s16((int16_t *)pi2_src2_nxt);
            pi2_src1 += 4;
            pi2_src2 += 4;

            /* Lane-wise src1 * wgt0 + src2 * wgt1 + constant, in 32 bits */
            acc_row0 = vaddq_s32(vmull_s16(src1_row0, wgt0_t),
                                 vmull_s16(src2_row0, wgt1_t));
            acc_row0 = vaddq_s32(acc_row0, bias_t);
            acc_row1 = vaddq_s32(vmull_s16(src1_row1, wgt0_t),
                                 vmull_s16(src2_row1, wgt1_t));
            acc_row1 = vaddq_s32(acc_row1, bias_t);

            /* Shift, then saturate 32 -> 16 -> 8 bits. vqmovn_u16 needs a   */
            /* 128-bit input, so the 4 values are duplicated in both halves  */
            /* and only the first 4 bytes (32-bit lane 0) are stored.        */
            res16 = vqmovun_s32(vshlq_s32(acc_row0, neg_shift_t));
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res8), 0);
            pu1_dst += 4;

            res16 = vqmovun_s32(vshlq_s32(acc_row1, neg_shift_t));
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res8), 0);
        }
        /* Rewind the columns walked above and step down two rows */
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI
/**
*******************************************************************************
*
* @brief
*  Default (unweighted) bi-prediction: sums the 16-bit samples at pi2_src1
*  and pi2_src2, rounds, shifts and writes the result, saturated to 8 bits,
*  at pu1_dst.
*
* @par Description:
*  dst = sat_u8( ( (src1 + lvl_shift1) + (src2 + lvl_shift2)
*                  + (1 << (shift - 1)) ) >> shift )
*  where shift = SHIFT_14_MINUS_BIT_DEPTH + 1
*
* @par Assumptions:
*  Every inner iteration consumes 4 samples from each of 2 consecutive rows,
*  so the block is covered without over-run only when wd is a multiple of 4
*  and ht is a multiple of 2.
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Stride of source 1, in 16-bit samples
*
* @param[in] src_strd2
*  Stride of source 2, in 16-bit samples
*
* @param[in] dst_strd
*  Destination stride, in bytes
*
* @param[in] lvl_shift1
*  Value added to every sample of source 1 (used as a 16-bit value)
*
* @param[in] lvl_shift2
*  Value added to every sample of source 2 (used as a 16-bit value)
*
* @param[in] ht
*  Height of the block in rows
*
* @param[in] wd
*  Width of the block in samples
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
                                             WORD16 *pi2_src2,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd1,
                                             WORD32 src_strd2,
                                             WORD32 dst_strd,
                                             WORD32 lvl_shift1,
                                             WORD32 lvl_shift2,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    /* Rounding term added before the shift */
    int32x4_t round_t = vmovq_n_s32(1 << (shift - 1));
    /* vshlq_s32 with a negative count performs an arithmetic right shift */
    int32x4_t neg_shift_t = vmovq_n_s32(0 - shift);
    int16x4_t lvl1_t = vmov_n_s16((int16_t)lvl_shift1);
    int16x4_t lvl2_t = vmov_n_s16((int16_t)lvl_shift2);
    int16x4_t a_row0, a_row1;
    int16x4_t b_row0, b_row1;
    int32x4_t sum_row0, sum_row1;
    uint16x4_t res16;
    uint8x8_t res8;
    WORD16 *pi2_src1_nxt;
    WORD16 *pi2_src2_nxt;
    UWORD8 *pu1_dst_nxt;

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src1_nxt = pi2_src1 + src_strd1;
            pi2_src2_nxt = pi2_src2 + src_strd2;
            pu1_dst_nxt = pu1_dst + dst_strd;

            /* 4 samples per source from the current row and the row below */
            a_row0 = vld1_s16((int16_t *)pi2_src1);
            b_row0 = vld1_s16((int16_t *)pi2_src2);
            a_row1 = vld1_s16((int16_t *)pi2_src1_nxt);
            b_row1 = vld1_s16((int16_t *)pi2_src2_nxt);
            pi2_src1 += 4;
            pi2_src2 += 4;

            /* (src1 + lvl_shift1) + (src2 + lvl_shift2) + rounding, widened */
            /* to 32 bits by vaddl_s16                                       */
            sum_row0 = vaddq_s32(vaddl_s16(a_row0, lvl1_t),
                                 vaddl_s16(b_row0, lvl2_t));
            sum_row0 = vaddq_s32(sum_row0, round_t);
            sum_row1 = vaddq_s32(vaddl_s16(a_row1, lvl1_t),
                                 vaddl_s16(b_row1, lvl2_t));
            sum_row1 = vaddq_s32(sum_row1, round_t);

            /* Shift, then saturate 32 -> 16 -> 8 bits. vqmovn_u16 needs a   */
            /* 128-bit input, so the 4 values are duplicated in both halves  */
            /* and only the first 4 bytes (32-bit lane 0) are stored.        */
            res16 = vqmovun_s32(vshlq_s32(sum_row0, neg_shift_t));
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res8), 0);
            pu1_dst += 4;

            res16 = vqmovun_s32(vshlq_s32(sum_row1, neg_shift_t));
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res8), 0);
        }
        /* Rewind the columns walked above and step down two rows */
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI_DEFAULT
/**
*******************************************************************************
*
* @brief
*  Chroma default (unweighted) bi-prediction: sums the 16-bit samples at
*  pi2_src1 and pi2_src2, rounds, shifts and writes the result, saturated to
*  8 bits, at pu1_dst.
*
* @par Description:
*  dst = sat_u8( ( (src1 + lvl_shift1) + (src2 + lvl_shift2)
*                  + (1 << (shift - 1)) ) >> shift )
*  where shift = SHIFT_14_MINUS_BIT_DEPTH + 1
*
* @par Assumptions:
*  2 * wd samples are processed per row (wd presumably counts Cb/Cr pairs of
*  an interleaved plane - confirm against the caller). Every inner iteration
*  consumes 4 samples from each of 2 consecutive rows, so 2 * wd is expected
*  to be a multiple of 4 and ht a multiple of 2.
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Stride of source 1, in 16-bit samples
*
* @param[in] src_strd2
*  Stride of source 2, in 16-bit samples
*
* @param[in] dst_strd
*  Destination stride, in bytes
*
* @param[in] lvl_shift1
*  Value added to every sample of source 1 (used as a 16-bit value)
*
* @param[in] lvl_shift2
*  Value added to every sample of source 2 (used as a 16-bit value)
*
* @param[in] ht
*  Height of the block in rows
*
* @param[in] wd
*  Width of the block; 2 * wd samples are processed per row
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
                                                    WORD16 *pi2_src2,
                                                    UWORD8 *pu1_dst,
                                                    WORD32 src_strd1,
                                                    WORD32 src_strd2,
                                                    WORD32 dst_strd,
                                                    WORD32 lvl_shift1,
                                                    WORD32 lvl_shift2,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    WORD32 shift;
    int32x4_t round_t;
    int32x4_t neg_shift_t;
    int16x4_t lvl1_t, lvl2_t;
    int16x4_t a_row0, a_row1;
    int16x4_t b_row0, b_row1;
    int32x4_t sum_row0, sum_row1;
    uint16x4_t res16;
    uint8x8_t res8;
    WORD16 *pi2_src1_nxt;
    WORD16 *pi2_src2_nxt;
    UWORD8 *pu1_dst_nxt;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    /* Rounding term added before the shift */
    round_t = vmovq_n_s32(1 << (shift - 1));
    /* vshlq_s32 with a negative count performs an arithmetic right shift */
    neg_shift_t = vmovq_n_s32(0 - shift);
    lvl1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl2_t = vmov_n_s16((int16_t)lvl_shift2);

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src1_nxt = pi2_src1 + src_strd1;
            pi2_src2_nxt = pi2_src2 + src_strd2;
            pu1_dst_nxt = pu1_dst + dst_strd;

            /* 4 samples per source from the current row and the row below */
            a_row0 = vld1_s16((int16_t *)pi2_src1);
            b_row0 = vld1_s16((int16_t *)pi2_src2);
            a_row1 = vld1_s16((int16_t *)pi2_src1_nxt);
            b_row1 = vld1_s16((int16_t *)pi2_src2_nxt);
            pi2_src1 += 4;
            pi2_src2 += 4;

            /* (src1 + lvl_shift1) + (src2 + lvl_shift2) + rounding, widened */
            /* to 32 bits by vaddl_s16                                       */
            sum_row0 = vaddq_s32(vaddl_s16(a_row0, lvl1_t),
                                 vaddl_s16(b_row0, lvl2_t));
            sum_row0 = vaddq_s32(sum_row0, round_t);
            sum_row1 = vaddq_s32(vaddl_s16(a_row1, lvl1_t),
                                 vaddl_s16(b_row1, lvl2_t));
            sum_row1 = vaddq_s32(sum_row1, round_t);

            /* Shift, then saturate 32 -> 16 -> 8 bits. vqmovn_u16 needs a   */
            /* 128-bit input, so the 4 values are duplicated in both halves  */
            /* and only the first 4 bytes (32-bit lane 0) are stored.        */
            res16 = vqmovun_s32(vshlq_s32(sum_row0, neg_shift_t));
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res8), 0);
            pu1_dst += 4;

            res16 = vqmovun_s32(vshlq_s32(sum_row1, neg_shift_t));
            res8 = vqmovn_u16(vcombine_u16(res16, res16));
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res8), 0);
        }
        /* Rewind the columns walked above and step down two rows */
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI_DEFAULT