/****************************************************************************** * * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ /** ******************************************************************************* * @file * ihevc_weighted_pred_atom_intr.c * * @brief * Contains function definitions for weighted prediction used in inter * prediction * * @author * * * @par List of Functions: * - ihevc_weighted_pred_uni_ssse3() * - ihevc_weighted_pred_bi_ssse3() * - ihevc_weighted_pred_bi_default_ssse3() * - ihevc_weighted_pred_chroma_uni_ssse3() * - ihevc_weighted_pred_chroma_bi_ssse3() * - ihevc_weighted_pred_chroma_bi_default_ssse3() * * @remarks * None * ******************************************************************************* */ /*****************************************************************************/ /* File Includes */ /*****************************************************************************/ #include <stdio.h> #include <assert.h> #include "ihevc_debug.h" #include "ihevc_typedefs.h" #include "ihevc_macros.h" #include "ihevc_platform_macros.h" #include "ihevc_func_selector.h" #include "ihevc_defs.h" #include "ihevc_weighted_pred.h" #include "ihevc_inter_pred.h" #include <immintrin.h> /** ******************************************************************************* * * @brief * Does uni-weighted prediction on the array pointed to by pi2_src and stores * it at the location pointed to by pu1_dst * * @par Description: * dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift + * offset * * @param[in] pi2_src * Pointer to the source * * @param[out] pu1_dst * Pointer to the destination * * @param[in] src_strd * Source stride * * @param[in] dst_strd * Destination stride * * @param[in] wgt0 * weight to be multiplied to the source * * @param[in] off0 * offset to be added after rounding and shifting * * @param[in] shift * (14 - bit depth) + log2_weight_denominator * * @param[in] lvl_shift * added before shift and offset * * @param[in] ht * height of the source * * @param[in] wd * width of the source * * @returns * * @remarks * None * ******************************************************************************* */ void ihevc_weighted_pred_uni_ssse3(WORD16 *pi2_src, UWORD8 *pu1_dst, WORD32 src_strd, WORD32 dst_strd, WORD32 wgt0, WORD32 off0, WORD32 shift, WORD32 lvl_shift, WORD32 ht, WORD32 wd) { WORD32 row, col, temp; /* all 128 bit registers are named with a suffix mxnb, where m is the */ /* number of n bits packed in the register */ __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b; __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b; __m128i res_temp0_4x32b, res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b; ASSERT(wd % 4 == 0); /* checking assumption*/ ASSERT(ht % 4 == 0); /* checking assumption*/ temp = 1 << (shift - 1); /* setting values in registers */ lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
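/* The widening multiplies in this file use a standard SSE2 idiom: _mm_mullo_epi16 returns the low 16 bits of each signed 16x16 product, _mm_mulhi_epi16 returns the high 16 bits, and interleaving the two with _mm_unpacklo_epi16/_mm_unpackhi_epi16 reassembles the full 32-bit products. */
/* An illustrative scalar sketch of what one lane of the loops below computes (not part of the original code; CLIP_U8 as used in the comments below): */
/* WORD32 i4_tmp = (WORD32)pi2_src[col] * wgt0 + lvl_shift * wgt0 + (1 << (shift - 1)); */
/* pu1_dst[col] = CLIP_U8((i4_tmp >> shift) + off0); */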
wgt0_8x16b = _mm_set1_epi16(wgt0); /* lvl_shift * wgt0 */ res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b); res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b); const_temp_4x32b = _mm_set1_epi32(temp); off0_4x32b = _mm_set1_epi32(off0); /* lvl_shift * wgt0 */ lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b); /* lvl_shift * wgt0 + 1 << (shift - 1) */ lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b); if(0 == (wd & 7)) /* wd multiple of 8 case */ { __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b; /* outer for loop starts from here */ for(row = 0; row < ht; row += 4) { for(col = 0; col < wd; col += 8) { /* for row =0 ,1,2,3*/ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src)); /* row = 1 */ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); /* row = 2 */ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd)); /* row = 3 */ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd)); /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b); res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b); /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */ src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b); src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b); /* Get 32 bit Result */ res_temp4_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b); res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b); /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b); res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b); res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b); res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b); res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b); /* (i4_tmp >> shift) */ /* First 4 pixels */ res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift); res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); /* (i4_tmp >> shift) */ /* Last 4 pixels */ res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift); res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift); res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift); res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift); /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */ 
res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b); /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b); res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b); res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b); res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b); res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp4_4x32b); res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b); res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp6_4x32b); res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b); /* pu1_dst[col] = CLIP_U8(i4_tmp); */ res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b); res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); res_temp2_4x32b = _mm_packus_epi16(res_temp2_4x32b, res_temp2_4x32b); res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b); /* store four 8-bit output values */ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), res_temp2_4x32b); /* row = 2*/ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), res_temp3_4x32b); /* row = 3*/ /* To update pointer */ pi2_src += 8; pu1_dst += 8; } /* inner loop ends here(4-output values in single iteration) */ pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ } } else /* wd multiple of 4 case */ { WORD32 dst0, dst1, dst2, dst3; /* outer for loop starts from here */ for(row = 0; row < ht; row += 4) { for(col = 0; col < wd; col += 4) { /* for row =0 ,1,2,3*/ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src)); /* row = 1 */ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd)); /* row = 2 */ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 2 * src_strd)); /* row = 3 */ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 3 * src_strd)); /* 2 rows together */ src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp2_8x16b); src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Higher 16 bit */ src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); /* Get 32 bit Result */ res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b); res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); /* (i4_tmp >> shift) */ res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift); res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); /*i4_tmp = (i4_tmp >> shift) + off0; */ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b); res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b); res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp3_4x32b); /* pu1_dst[col] = CLIP_U8(i4_tmp); */ res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp2_4x32b); dst0 = _mm_cvtsi128_si32(res_temp0_4x32b); /* dst row = 1 to 3 */ res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1); res_temp2_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 2); res_temp3_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 3); /* store four 8-bit output values */ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; dst1 = _mm_cvtsi128_si32(res_temp1_4x32b); dst2 = _mm_cvtsi128_si32(res_temp2_4x32b); dst3 = _mm_cvtsi128_si32(res_temp3_4x32b); /* row = 1 to row = 3 */ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* To update pointer */ pi2_src += 4; pu1_dst += 4; } /* inner loop ends here(4-output values in single iteration) */ pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ } } } /** ******************************************************************************* * * @brief * Does chroma uni-weighted prediction on the array pointed to by pi2_src and * stores it at the location pointed to by pu1_dst * * @par Description: * dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift + * offset * * @param[in] pi2_src * Pointer to the source * * @param[out] pu1_dst * Pointer to the destination * * 
@param[in] src_strd * Source stride * * @param[in] dst_strd * Destination stride * * @param[in] wgt0 * weight to be multiplied to the source * * @param[in] off0 * offset to be added after rounding and shifting * * @param[in] shift * (14 - bit depth) + log2_weight_denominator * * @param[in] lvl_shift * added before shift and offset * * @param[in] ht * height of the source * * @param[in] wd * width of the source (each colour component) * * @returns * * @remarks * None * ******************************************************************************* */ void ihevc_weighted_pred_chroma_uni_ssse3(WORD16 *pi2_src, UWORD8 *pu1_dst, WORD32 src_strd, WORD32 dst_strd, WORD32 wgt0_cb, WORD32 wgt0_cr, WORD32 off0_cb, WORD32 off0_cr, WORD32 shift, WORD32 lvl_shift, WORD32 ht, WORD32 wd) { WORD32 row, col, temp, wdx2; /* all 128 bit registers are named with a suffix mxnb, where m is the */ /* number of n bits packed in the register */ __m128i src_temp0_8x16b, src_temp1_8x16b; __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b; __m128i res_temp0_4x32b, res_temp1_4x32b; ASSERT(wd % 2 == 0); /* checking assumption*/ ASSERT(ht % 2 == 0); /* checking assumption*/ temp = 1 << (shift - 1); wdx2 = 2 * wd; /* setting values in registers */ lvl_shift_4x32b = _mm_set1_epi16(lvl_shift); wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb); /* lvl_shift * wgt0 */ res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b); res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b); const_temp_4x32b = _mm_set1_epi32(temp); off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb); /* lvl_shift * wgt0 */ lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b); /* lvl_shift * wgt0 + 1 << (shift - 1) */ lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b); { if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */ { __m128i src_temp2_8x16b, src_temp3_8x16b; __m128i res_temp2_4x32b, res_temp3_4x32b; __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b; /* outer for loop starts from here */ for(row = 0; row < ht; row += 2) { for(col = 0; col < wdx2; col += 16) { /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src)); /* row = 1 */ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); /* row = 0 */ /* Next 8 pixels */ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 8)); /* row = 1 */ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8)); /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); res_temp4_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b); res_temp5_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b); /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */ src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b); src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b); /* Get 32 bit Result */ res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp6_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp2_8x16b); res_temp7_4x32b = _mm_unpackhi_epi16(res_temp5_4x32b, src_temp3_8x16b); res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp2_8x16b); res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, src_temp3_8x16b); /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b); res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b); res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b); res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b); res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b); /* (i4_tmp >> shift) */ res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift); res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); /*i4_tmp = (i4_tmp >> shift) + off0; */ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b); /* (i4_tmp >> shift) */ res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift); res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift); res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift); res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift); /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */ res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b); res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b); /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b); res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b); res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b); res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b); res_temp4_4x32b = 
_mm_packs_epi32(res_temp4_4x32b, res_temp6_4x32b); res_temp5_4x32b = _mm_packs_epi32(res_temp5_4x32b, res_temp7_4x32b); /* pu1_dst[col] = CLIP_U8(i4_tmp); */ res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp4_4x32b); res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp5_4x32b); /* store 16 8-bit output values */ _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/ _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/ pi2_src += 16; /* Pointer update */ pu1_dst += 16; /* Pointer update */ } /* inner loop ends here(4-output values in single iteration) */ pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ } } else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */ { __m128i res_temp2_4x32b, res_temp3_4x32b; /* outer for loop starts from here */ for(row = 0; row < ht; row += 2) { for(col = 0; col < wdx2; col += 8) { /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src)); /* row = 1 */ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */ src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); /* Get 32 bit Result */ res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b); /* (i4_tmp >> shift) */ res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift); res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); /*i4_tmp = (i4_tmp >> shift) + off0; */ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b); res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b); res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b); /* pu1_dst[col] = CLIP_U8(i4_tmp); */ res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b); res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); /* store four 8-bit output values */ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/ pi2_src += 8; /* Pointer update */ pu1_dst += 8; /* Pointer update */ } /* inner loop ends here(4-output values in single iteration) */ pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ pu1_dst = pu1_dst - wdx2 
+ 2 * dst_strd; /* Pointer update */ } } else /* 2*wd multiple of 4 case */ { WORD32 dst0, dst1; /* outer for loop starts from here */ for(row = 0; row < ht; row += 2) { for(col = 0; col < wdx2; col += 4) { /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src)); /* row = 1 */ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd)); /* 2 rows together */ src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b); /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */ src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); /* Get 32 bit Result */ res_temp1_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); /* (i4_tmp >> shift) */ res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); /*i4_tmp = (i4_tmp >> shift) + off0; */ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b); /* pu1_dst[col] = CLIP_U8(i4_tmp); */ res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b); dst0 = _mm_cvtsi128_si32(res_temp0_4x32b); /* dst row = 1 to 3 */ res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1); /* store four 8-bit output values */ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; dst1 = _mm_cvtsi128_si32(res_temp1_4x32b); /* row = 1 */ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; pi2_src += 4; /* Pointer update */ pu1_dst += 4; /* Pointer update */ } /* inner loop ends here(4-output values in single iteration) */ pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ } } } } /** ******************************************************************************* * * @brief * Does bi-weighted prediction on the arrays pointed to by pi2_src1 and * pi2_src2 and stores it at the location pointed to by pu1_dst * * @par Description: * dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 + * off1 + 1) << (shift - 1) ) >> shift * * @param[in] pi2_src1 * Pointer to source 1 * * @param[in] pi2_src2 * Pointer to source 2 * * @param[out] pu1_dst * Pointer to destination * * @param[in] src_strd1 * Source stride 1 * * @param[in] src_strd2 * Source stride 2 * * @param[in] dst_strd * Destination stride * * @param[in] wgt0 * weight to be multiplied to source 1 * * @param[in] off0 * offset 0 * * @param[in] wgt1 * weight to be multiplied to source 2 * * @param[in] off1 * offset 1 * * @param[in] shift * (14 - bit depth) + log2_weight_denominator * * @param[in] lvl_shift1 * added before shift and offset * * @param[in] lvl_shift2 * added before shift and offset * * @param[in] ht * height of the source * * @param[in] wd * width of the source * * @returns * * @remarks * None * ******************************************************************************* */ void ihevc_weighted_pred_bi_ssse3(WORD16 *pi2_src1, WORD16 *pi2_src2, UWORD8 *pu1_dst, WORD32 src_strd1, WORD32 src_strd2, WORD32 dst_strd, WORD32 wgt0, WORD32 off0, WORD32 wgt1, WORD32 off1, WORD32 
shift, WORD32 lvl_shift1, WORD32 lvl_shift2, WORD32 ht, WORD32 wd) { WORD32 row, col, temp; __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b; __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b; ASSERT(wd % 4 == 0); /* checking assumption*/ ASSERT(ht % 4 == 0); /* checking assumption*/ temp = (off0 + off1 + 1) << (shift - 1); /* setting values in registers */ lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1); wgt0_8x16b = _mm_set1_epi16(wgt0); lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2); wgt1_8x16b = _mm_set1_epi16(wgt1); /* lvl_shift1 * wgt0 */ res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b); res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b); /* lvl_shift2 * wgt1 */ res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b); res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b); const_temp_4x32b = _mm_set1_epi32(temp); /* lvl_shift1 * wgt0 */ lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b); /* lvl_shift2 * wgt1 */ lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b); if(0 == (wd & 7)) /* wd multiple of 8 case */ { __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b; /* outer for loop starts from here */ for(row = 0; row < ht; row += 2) { for(col = 0; col < wd; col += 8) { /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */ res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b); res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b); res_temp4_4x32b = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b); /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */ src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b); src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b); src_temp4_8x16b = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b); /* Get 32 bit Result */ res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b); res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b); res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b); res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b); /* (pi2_src[col] + lvl_shift) * wgt */ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b); res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b); res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b); res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b); res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, 
lvl_shift1_4x32b); res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b); /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b); /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b); /* (i4_tmp >> shift) */ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); /* Next 4 Pixels */ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b); res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b); res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b); res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b); res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift); res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift); res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b); res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b); /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b); /* store four 8-bit output values */ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/ pi2_src1 += 8; /* Pointer update */ pi2_src2 += 8; /* Pointer update */ pu1_dst += 8; /* Pointer update */ } /* inner loop ends here(4-output values in single iteration) */ pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */ pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */ pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */ } /* outer loop ends */ } else /* wd multiple of 4 case */ { WORD32 dst0, dst1; /* outer for loop starts from here */ for(row = 0; row < ht; row += 2) { for(col = 0; col < wd; col += 4) { /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ /* 2 rows together */ src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */ res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b); /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */ src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b); /* Get 32 bit Result */ res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); /* (pi2_src[col] + lvl_shift) * wgt */ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b); res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b); res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b); /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b); /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b); /* (i4_tmp >> shift) */ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b); /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); dst0 = _mm_cvtsi128_si32(res_temp1_4x32b); /* dst row = 1 to 3 */ res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1); /* store four 8-bit output values */ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; dst1 = _mm_cvtsi128_si32(res_temp2_4x32b); /* row = 1 */ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; pi2_src1 += 4; /* Pointer update */ pi2_src2 += 4; /* Pointer update */ pu1_dst += 4; /* Pointer update */ } /* inner loop ends here(4-output values in single iteration) */ pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */ pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */ pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */ } /* outer loop ends */ } } /** ******************************************************************************* * * @brief * Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1 and * pi2_src2 and stores it at the location pointed to by pu1_dst * * @par Description: * dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 + * off1 + 1) << (shift - 1) ) >> shift * * @param[in] pi2_src1 * Pointer to source 1 * * @param[in] pi2_src2 * Pointer to source 2 * * @param[out] pu1_dst * Pointer to destination * * @param[in] src_strd1 * Source stride 1 * * @param[in] src_strd2 * Source stride 2 * * @param[in] dst_strd * Destination stride * * @param[in] wgt0 * 
weight to be multiplied to source 1 * * @param[in] off0 * offset 0 * * @param[in] wgt1 * weight to be multiplied to source 2 * * @param[in] off1 * offset 1 * * @param[in] shift * (14 - bit depth) + log2_weight_denominator * * @param[in] lvl_shift1 * added before shift and offset * * @param[in] lvl_shift2 * added before shift and offset * * @param[in] ht * height of the source * * @param[in] wd * width of the source (each colour component) * * @returns * * @remarks * None * ******************************************************************************* */ void ihevc_weighted_pred_chroma_bi_ssse3(WORD16 *pi2_src1, WORD16 *pi2_src2, UWORD8 *pu1_dst, WORD32 src_strd1, WORD32 src_strd2, WORD32 dst_strd, WORD32 wgt0_cb, WORD32 wgt0_cr, WORD32 off0_cb, WORD32 off0_cr, WORD32 wgt1_cb, WORD32 wgt1_cr, WORD32 off1_cb, WORD32 off1_cr, WORD32 shift, WORD32 lvl_shift1, WORD32 lvl_shift2, WORD32 ht, WORD32 wd) { WORD32 row, col, temp1, temp2; WORD32 wdx2; __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b; __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b; ASSERT(wd % 2 == 0); /* checking assumption*/ ASSERT(ht % 2 == 0); /* checking assumption*/ temp1 = (off0_cb + off1_cb + 1) << (shift - 1); temp2 = (off0_cr + off1_cr + 1) << (shift - 1); /* setting values in registers */ lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1); wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb); lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2); wgt1_8x16b = _mm_set_epi16(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb); /* lvl_shift1 * wgt0 */ res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b); res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b); /* lvl_shift2 * wgt1 */ res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b); res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b); const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1); wdx2 = wd * 2; /* lvl_shift1 * wgt0 */ lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b); /* lvl_shift2 * wgt1 */ lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b); if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */ { __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b; /* outer for loop starts from here */ for(row = 0; row < ht; row += 2) { for(col = 0; col < wdx2; col += 8) { /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */ res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b); res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b); res_temp4_4x32b = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b); /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */ src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b); src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b); src_temp4_8x16b = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b); /* Get 32 bit Result */ res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b); res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b); res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b); res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b); /* (pi2_src[col] + lvl_shift) * wgt */ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b); res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b); res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b); res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b); res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b); res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b); /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b); /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b); /* (i4_tmp >> shift) */ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); /* Next 4 Pixels */ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b); res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b); res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b); res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b); res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift); res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift); res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b); res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b); /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b); /* store four 8-bit output values */ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/ _mm_storel_epi64((__m128i 
*)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/ pi2_src1 += 8; /* Pointer update */ pi2_src2 += 8; /* Pointer update */ pu1_dst += 8; /* Pointer update */ } /* inner loop ends here(4-output values in single iteration) */ pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ } /* outer loop ends */ } else /* wdx2 multiple of 4 case */ { WORD32 dst0, dst1; /* outer for loop starts from here */ for(row = 0; row < ht; row += 2) { for(col = 0; col < wdx2; col += 4) { /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ /* 2 rows together */ src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */ res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b); /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */ src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b); /* Get 32 bit Result */ res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); /* (pi2_src[col] + lvl_shift) * wgt */ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b); res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b); res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b); res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b); /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b); /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b); res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b); /* (i4_tmp >> shift) */ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b); /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); dst0 = _mm_cvtsi128_si32(res_temp1_4x32b); /* dst row = 1 to 3 */ res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1); /* store four 8-bit output values */ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; dst1 = _mm_cvtsi128_si32(res_temp2_4x32b); /* row = 1 */ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; pi2_src1 += 4; /* Pointer update */ pi2_src2 += 4; /* Pointer update */ pu1_dst += 4; /* Pointer update */ } /* inner loop ends here(4-output values in single iteration) */ pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ } } } /** 
******************************************************************************* * * @brief * Does default bi-weighted prediction on the arrays pointed to by pi2_src1 and * pi2_src2 and stores it at the location pointed to by pu1_dst * * @par Description: * dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) ) * >> shift where shift = 15 - BitDepth * * @param[in] pi2_src1 * Pointer to source 1 * * @param[in] pi2_src2 * Pointer to source 2 * * @param[out] pu1_dst * Pointer to destination * * @param[in] src_strd1 * Source stride 1 * * @param[in] src_strd2 * Source stride 2 * * @param[in] dst_strd * Destination stride * * @param[in] lvl_shift1 * added before shift and offset * * @param[in] lvl_shift2 * added before shift and offset * * @param[in] ht * height of the source * * @param[in] wd * width of the source * * @returns * * @remarks * None * * Assumption : ht%2 == 0, wd%4 == 0 * shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case, * final result will match even if intermediate precision is in 16 bit. * ******************************************************************************* */ void ihevc_weighted_pred_bi_default_ssse3(WORD16 *pi2_src1, WORD16 *pi2_src2, UWORD8 *pu1_dst, WORD32 src_strd1, WORD32 src_strd2, WORD32 dst_strd, WORD32 lvl_shift1, WORD32 lvl_shift2, WORD32 ht, WORD32 wd) { { WORD32 row, col, temp; WORD32 shift; __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b; __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; ASSERT(wd % 4 == 0); /* checking assumption*/ ASSERT(ht % 2 == 0); /* checking assumption*/ shift = SHIFT_14_MINUS_BIT_DEPTH + 1; temp = 1 << (shift - 1); /* setting values in registers */ lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1); lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2); const_temp_8x16b = _mm_set1_epi16(temp); lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b); lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b); if(0 == (ht & 3)) /* ht multiple of 4*/ { if(0 == (wd & 15)) /* wd multiple of 16 case */ { __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b; __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b; /* outer for loop starts from here */ for(row = 0; row < ht; row += 4) { for(col = 0; col < wd; col += 16) { /*load 8 pixel values */ /* First 8 Values */ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 1 */ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); /* row = 2 */ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1)); src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2)); /* row = 3 */ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1)); src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2)); /*load 8 pixel values */ /* Second 8 Values */ src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8)); src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8)); /* row = 1 */ src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8)); src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8)); /* row = 2 */ src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8)); src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 
8)); /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b); /*load 8 pixel values */ /* Second 8 Values */ /* row = 3 */ src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8)); src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8)); /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b); /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */ src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b); src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b); src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b); src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b); /* (i4_tmp >> shift) */ /* First 8 Values */ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift); /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */ src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b); src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b); src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b); src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b); /* (i4_tmp >> shift) */ /* Second 8 Values */ src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift); src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift); src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift); src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift); /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* 16 8 Values */ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b); src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b); src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b); src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b); /* store four 8-bit output values */ /* 16 8 Values */ _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/ _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/ _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/ /* To update pointer */ pi2_src1 += 16; pi2_src2 += 16; pu1_dst += 16; } /* inner loop ends here(8-output values in single iteration) */ pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */ pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ } } else if(0 == (wd & 7)) /* multiple of 8 case */ { /* outer for loop starts from here */ for(row = 0; row < ht; row += 4) { for(col = 0; col < wd; col += 8) { /*load 8 pixel values */ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 
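/* As in the 16-pixel path above, the sums that follow stay entirely in 16 bits: _mm_adds_epi16 saturates on overflow, and per the precision note in the function header the 16-bit result still matches the 32-bit reference computation for the permitted lvl_shift values. */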
src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 1 */ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); /* row = 2 */ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1)); src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2)); /* row = 3 */ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1)); src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2)); /* (pi2_src1[col] + pi2_src2[col]) */ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b); /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b); /* (i4_tmp >> shift) */ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift); /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b); src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b); /* store four 8-bit output values */ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/ /* To update pointer */ pi2_src1 += 8; pi2_src2 += 8; pu1_dst += 8; } /* inner loop ends here(8-output values in single iteration) */ pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */ pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ } } else /* wd multiple of 4 case*/ { WORD32 dst0, dst1, dst2, dst3; /* outer for loop starts from here */ for(row = 0; row < ht; row += 4) { for(col = 0; col < wd; col += 4) { /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 1 */ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1)); src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2)); /* row = 2 */ src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1)); src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2)); /* row = 3 */ src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1)); src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2)); /* Pack two rows together */ src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b); src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b); /* (pi2_src1[col] + pi2_src2[col]) */ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); /* (i4_tmp >> shift) */ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); dst0 = _mm_cvtsi128_si32(src_temp1_8x16b); /* dst row = 1 to 3 */ src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1); src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1); /* store four 8-bit output values */ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; dst1 = _mm_cvtsi128_si32(src_temp2_8x16b); dst2 = _mm_cvtsi128_si32(src_temp5_8x16b); dst3 = _mm_cvtsi128_si32(src_temp4_8x16b); /* row = 1 to row = 3 */ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* To update pointer */ pi2_src1 += 4; pi2_src2 += 4; pu1_dst += 4; } /* inner loop ends here(4-output values in single iteration) */ pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */ pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ } } } else /* ht multiple of 2 case and wd multiple of 4 case*/ { WORD32 dst0, dst1; /* outer for loop starts from here */ for(row = 0; row < ht; row += 2) { for(col = 0; col < wd; col += 4) { /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /*load 8 pixel values from 7:0 pos. relative to cur. 
                /* load 4 pixel values from 7:0 pos. relative to cur. pos. */
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
                /* row = 1 */
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));

                /* Pack two rows together */
                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);

                /* (pi2_src1[col] + pi2_src2[col]) */
                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                /* (i4_tmp >> shift) */
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);

                dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);

                /* dst row = 1: move dword 1 to lane 0 for extraction */
                src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);

                /* store four 8-bit output values */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                /* To update pointer */
                pi2_src1 += 4;
                pi2_src2 += 4;
                pu1_dst += 4;

            } /* inner loop ends here(4-output values in single iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
            pu1_dst = pu1_dst - wd + 2 * dst_strd;    /* Pointer update */
        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Does chroma default bi-weighted prediction on arrays pointed by pi2_src1
*  and pi2_src2 and stores it at the location pointed by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) )
*  >> shift  where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (each colour component)
*
* @returns
*
* @remarks
*  Assumption : ht % 2 == 0, wd % 2 == 0, lvl_shift1 == 0, lvl_shift2 == 0,
*  shift == 7. (lvl_shift1 + lvl_shift2) can take {0, 8K, 16K}; in that case
*  the final result matches even if the intermediate precision is 16 bit.
*
*******************************************************************************
*/
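/*
 * A minimal scalar sketch of the operation implemented below, kept under
 * "#if 0" for reference only; the helper name is illustrative and not part of
 * the library. Each output sample is the rounded, right-shifted sum of the
 * two 16-bit intermediate predictions, clipped to [0, 255].
 */
#if 0
static void weighted_pred_chroma_bi_default_ref(WORD16 *pi2_src1,
                                                WORD16 *pi2_src2,
                                                UWORD8 *pu1_dst,
                                                WORD32 src_strd1,
                                                WORD32 src_strd2,
                                                WORD32 dst_strd,
                                                WORD32 ht,
                                                WORD32 wd)
{
    WORD32 row, col;
    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;

    for(row = 0; row < ht; row++)
    {
        /* 2 * wd interleaved Cb/Cr samples per row */
        for(col = 0; col < 2 * wd; col++)
        {
            WORD32 i4_tmp = pi2_src1[col] + pi2_src2[col] + (1 << (shift - 1));
            pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
#endif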
void ihevc_weighted_pred_chroma_bi_default_ssse3(WORD16 *pi2_src1,
                                                 WORD16 *pi2_src2,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd1,
                                                 WORD32 src_strd2,
                                                 WORD32 dst_strd,
                                                 WORD32 lvl_shift1,
                                                 WORD32 lvl_shift2,
                                                 WORD32 ht,
                                                 WORD32 wd)
{
    WORD32 row, col, temp;
    WORD32 shift, wdx2;

    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
    __m128i lvl_shift1_8x16b;
    __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

    ASSERT(wd % 2 == 0); /* checking assumption */
    ASSERT(ht % 2 == 0); /* checking assumption */
    UNUSED(lvl_shift1);
    UNUSED(lvl_shift2);

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    temp = 1 << (shift - 1);
    wdx2 = wd * 2;

    /* setting the rounding constant (1 << (shift - 1)) in a register; */
    /* lvl_shift1 and lvl_shift2 are assumed to be 0, so this register */
    /* carries only the rounding term */
    lvl_shift1_8x16b = _mm_set1_epi16(temp);

    if(0 == (ht & 3)) /* ht multiple of 4 case */
    {
        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
        {
            __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
            __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;

            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wdx2; col += 16)
                {
                    /* load 8 pixel values */ /* First 8 Values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* load 8 pixel values */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
                    /* row = 1 */
                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
                    /* row = 2 */
                    src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
                    src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                    /* load 8 pixel values */ /* Second 8 Values */
                    /* row = 3 */
                    src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
                    src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
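                    /*
                     * Editorial note: _mm_adds_epi16 is a saturating add; under
                     * the assumptions stated in the remarks above (zero level
                     * shifts, shift == 7) the sums stay representable in 16 bits,
                     * so the saturating result equals the full-precision one.
                     * Loads of the second 8 values are interleaved with the
                     * arithmetic on the first 8 to help hide load latency.
                     */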
                    /* (i4_tmp >> shift) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);

                    /* (i4_tmp >> shift) */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift);
                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift);
                    src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift);
                    src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift);

                    /* store eight 8-bit output values */ /* First 8 Values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3 */

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
                    src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
                    src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, src_temp13_8x16b);
                    src_temp15_8x16b = _mm_packus_epi16(src_temp15_8x16b, src_temp15_8x16b);

                    /* store eight 8-bit output values */ /* Second 8 Values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b);  /* row = 0 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 1 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd + 8), src_temp13_8x16b); /* row = 2 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd + 8), src_temp15_8x16b); /* row = 3 */

                    /* To update pointer */
                    pi2_src1 += 16;
                    pi2_src2 += 16;
                    pu1_dst += 16;

                } /* inner loop ends here(16-output values in single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4 * dst_strd;    /* Pointer update */
            }
        }
        else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
        {
            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wdx2; col += 8)
                {
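                    /*
                     * Editorial note: this path covers 2*wd that is a multiple
                     * of 8 (wd == 4, 12, ... per colour component); one 128-bit
                     * load fetches eight interleaved Cb/Cr samples of a row.
                     */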
                    /* load 8 pixel values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);

                    /* store eight 8-bit output values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3 */

                    /* To update pointer */
                    pi2_src1 += 8;
                    pi2_src2 += 8;
                    pu1_dst += 8;

                } /* inner loop ends here(8-output values in single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4 * dst_strd;    /* Pointer update */
            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1, dst2, dst3;

            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wdx2; col += 4)
                {
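                    /*
                     * Editorial note: residual case where 2*wd is a multiple of
                     * 4 only (wd == 2, 6, ... per colour component); two rows of
                     * four samples are packed into one register so even the
                     * narrowest blocks use full-width SIMD arithmetic.
                     */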
                    /* load 4 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* Pack two rows together */
                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
                    src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);

                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);

                    /* dst rows 1 and 3: move dword 1 to lane 0 for extraction */
                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
                    src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);

                    /* store four 8-bit output values */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
                    dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
                    dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);

                    /* row = 1 to row = 3 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                    /* To update pointer */
                    pi2_src1 += 4;
                    pi2_src2 += 4;
                    pu1_dst += 4;

                } /* inner loop ends here(4-output values in single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4 * dst_strd;    /* Pointer update */
            }
        }
    }
    else /* ht multiple of 2 case */
    {
        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
        {
            __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;

            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 16)
                {
                    /* load 8 pixel values */ /* First 8 Values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));

                    /* load 8 pixel values */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
                    /* row = 1 */
                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);

                    /* (i4_tmp >> shift) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);

                    /* (i4_tmp >> shift) */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift);
                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift);

                    /* store eight 8-bit output values */ /* First 8 Values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1 */

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
                    src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);

                    /* store eight 8-bit output values */ /* Second 8 Values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b);  /* row = 0 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 1 */

                    /* To update pointer */
                    pi2_src1 += 16;
                    pi2_src2 += 16;
                    pu1_dst += 16;

                } /* inner loop ends here(16-output values in single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd;    /* Pointer update */
            }
        }
        else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
        {
            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 8)
                {
                    /* load 8 pixel values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);

                    /* store eight 8-bit output values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1 */

                    /* To update pointer */
                    pi2_src1 += 8;
                    pi2_src2 += 8;
                    pu1_dst += 8;

                } /* inner loop ends here(8-output values in single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd;    /* Pointer update */
            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1;

            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 4)
                {
                    /* load 4 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));

                    /* Pack two rows together */
                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);

                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);

                    /* dst row = 1: move dword 1 to lane 0 for extraction */
                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);

                    /* store four 8-bit output values */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);

                    /* row = 1 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                    /* To update pointer */
                    pi2_src1 += 4;
                    pi2_src2 += 4;
                    pu1_dst += 4;

                } /* inner loop ends here(4-output values in single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd;    /* Pointer update */
            }
        }
    }
}
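/*
 * A hedged usage sketch (illustrative only; buffer names and sizes are
 * assumptions, not part of the library): bi-default weighted prediction of a
 * 4x4 chroma block. wd is per colour component, so each row carries 8
 * interleaved Cb/Cr samples.
 */
#if 0
{
    WORD16 ai2_pred1[4 * 8]; /* list-0 intermediate prediction, stride 8 */
    WORD16 ai2_pred2[4 * 8]; /* list-1 intermediate prediction, stride 8 */
    UWORD8 au1_out[4 * 8];   /* output block, stride 8 */

    /* ... fill ai2_pred1/ai2_pred2 with intermediate predictions ... */

    ihevc_weighted_pred_chroma_bi_default_ssse3(ai2_pred1, ai2_pred2, au1_out,
                                                8, 8, 8, 0, 0, 4, 4);
}
#endif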