/****************************************************************************** * * Copyright (C) 2015 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ***************************************************************************** * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore */ /** ******************************************************************************* * @file * ih264_resi_trans_quant_sse42.c * * @brief * Contains function definitions single stage forward transform for H.264 * It will calculate the residue, do the cf and then do quantization * * @author * Mohit [100664] * * @par List of Functions: * - ih264_resi_trans_quant_4x4_sse42() * - ih264_resi_trans_quant_chroma_4x4_sse42() * * @remarks * None * ******************************************************************************* */ /* System include files */ #include <stddef.h> /* User include files */ #include "ih264_typedefs.h" #include "ih264_defs.h" #include "ih264_size_defs.h" #include "ih264_macros.h" #include "ih264_trans_macros.h" #include "ih264_trans_data.h" #include "ih264_structs.h" #include "ih264_trans_quant_itrans_iquant.h" #include <immintrin.h> /** ******************************************************************************* * * @brief * This function performs forward transform and quantization on a 4*4 block * * @par Description: * The function accepts source buffer and estimation buffer. From these, it * computes the residue. This is residue is then transformed and quantized. * The transform and quantization are in placed computed. They use the residue * buffer for this. * * @param[in] pu1_src * Pointer to source sub-block * * @param[in] pu1_pred * Pointer to prediction sub-block * * @param[in] pi2_out * Pointer to residual sub-block * * @param[in] src_strd * Source stride * * @param[in] pred_strd * Prediction stride * * @param[in] dst_strd * Destination stride * * @param[in] u4_qbits * QP_BITS_h264_4x4 + floor(QP/6) * * @param[in] pu2_threshold_matrix * Pointer to Forward Quant Threshold Matrix * * @param[in] pu2_scale_matrix * Pointer to Forward Quant Scale Matrix * * @param[in] u4_round_factor * Quantization Round factor * * @param[out] pu1_nnz * Total non-zero coefficients in the current sub-block * * @returns * * @remarks * None * ******************************************************************************* */ void ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd, const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, UWORD32 u4_round_factor, UWORD8 *pu1_nnz, WORD16 *pi2_alt_dc_addr) { WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0; WORD32 mask0, mask1; __m128i sum0, sum1, sum2, cmp0, cmp1; __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); __m128i temp_2 = _mm_set1_epi16(2); __m128i temp_1 = _mm_set1_epi16(1); __m128i src_r0, src_r1, src_r2, src_r3; __m128i pred_r0, pred_r1, pred_r2, pred_r3; __m128i temp0, temp1, temp2, temp3; __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero __m128i sign_reg0, sign_reg2; __m128i scalemat_r0_r1, scalemat_r2_r3; UNUSED (pu2_threshold_matrix); scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits src_r0 = _mm_cvtepu8_epi16(src_r0); src_r1 = _mm_cvtepu8_epi16(src_r1); src_r2 = _mm_cvtepu8_epi16(src_r2); src_r3 = _mm_cvtepu8_epi16(src_r3); pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits src_r0 = _mm_sub_epi16(src_r0, pred_r0); src_r1 = _mm_sub_epi16(src_r1, pred_r1); src_r2 = _mm_sub_epi16(src_r2, pred_r2); src_r3 = _mm_sub_epi16(src_r3, pred_r3); /* Perform Forward transform */ /*-------------------------------------------------------------*/ /* DCT [ Horizontal transformation ] */ /*-------------------------------------------------------------*/ // Matrix transpose /* * a0 a1 a2 a3 * b0 b1 b2 b3 * c0 c1 c2 c3 * d0 d1 d2 d3 */ temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3 temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3 temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 b0 c0 d0 a1 b1 c1 d1 temp3 = _mm_unpackhi_epi32(temp0, temp2); //a2 b2 c2 d2 a3 b3 c3 d3 src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0 src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1 src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2 src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3 /*----------------------------------------------------------*/ /* x0 = z0 + z3 */ temp0 = _mm_add_epi16(src_r0, src_r3); /* x1 = z1 + z2 */ temp1 = _mm_add_epi16(src_r1, src_r2); /* x2 = z1 - z2 */ temp2 = _mm_sub_epi16(src_r1, src_r2); /* x3 = z0 - z3 */ temp3 = _mm_sub_epi16(src_r0, src_r3); /* z0 = x0 + x1 */ src_r0 = _mm_add_epi16(temp0, temp1); /* z1 = (x3 << 1) + x2 */ src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1) src_r1 = _mm_add_epi16(src_r1, temp2); /* z2 = x0 - x1 */ src_r2 = _mm_sub_epi16(temp0, temp1); /* z3 = x3 - (x2 << 1) */ src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1) src_r3 = _mm_sub_epi16(temp3, src_r3); // Matrix transpose /* * a0 b0 c0 d0 * a1 b1 c1 d1 * a2 b2 c2 d2 * a3 b3 c3 d3 */ temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1 temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3 temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 a1 a2 a3 b0 b1 b2 b3 temp3 = _mm_unpackhi_epi32(temp0, temp2); //c0 c1 c2 c3 d0 d1 d2 d3 src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3 src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3 src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3 src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3 /*----------------------------------------------------------*/ /* x0 = z0 + z3 */ temp0 = _mm_add_epi16(src_r0, src_r3); /* x1 = z1 + z2 */ temp1 = _mm_add_epi16(src_r1, src_r2); /* x2 = z1 - z2 */ temp2 = _mm_sub_epi16(src_r1, src_r2); /* x3 = z0 - z3 */ temp3 = _mm_sub_epi16(src_r0, src_r3); /* z0 = x0 + x1 */ src_r0 = _mm_add_epi16(temp0, temp1); /* z1 = (x3 << 1) + x2 */ src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1) src_r1 = _mm_add_epi16(src_r1, temp2); /* z2 = x0 - x1 */ src_r2 = _mm_sub_epi16(temp0, temp1); /* z3 = x3 - (x2 << 1) */ src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1) src_r3 = _mm_sub_epi16(temp3, src_r3); tmp_dc = _mm_extract_epi16(src_r0,0); //a0 *pi2_alt_dc_addr = tmp_dc; src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3 src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3 sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0); sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2); sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0); sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2); sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); sign_reg2 = _mm_add_epi16(temp_1,sign_reg2); src_r0 = _mm_abs_epi16(src_r0); src_r2 = _mm_abs_epi16(src_r2); src_r1 = _mm_srli_si128(src_r0, 8); src_r0 = _mm_cvtepu16_epi32(src_r0); src_r1 = _mm_cvtepu16_epi32(src_r1); src_r3 = _mm_srli_si128(src_r2, 8); src_r2 = _mm_cvtepu16_epi32(src_r2); src_r3 = _mm_cvtepu16_epi32(src_r3); temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1); scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8); temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3); scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8); temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1); temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3); temp0 = _mm_mullo_epi32(temp0, src_r0); temp1 = _mm_mullo_epi32(temp1, src_r1); temp2 = _mm_mullo_epi32(temp2, src_r2); temp3 = _mm_mullo_epi32(temp3, src_r3); temp0 = _mm_add_epi32(temp0,rnd_fact); temp1 = _mm_add_epi32(temp1,rnd_fact); temp2 = _mm_add_epi32(temp2,rnd_fact); temp3 = _mm_add_epi32(temp3,rnd_fact); temp0 = _mm_srli_epi32(temp0,u4_qbits); temp1 = _mm_srli_epi32(temp1,u4_qbits); temp2 = _mm_srli_epi32(temp2,u4_qbits); temp3 = _mm_srli_epi32(temp3,u4_qbits); temp0 = _mm_packs_epi32 (temp0,temp1); temp2 = _mm_packs_epi32 (temp2,temp3); temp0 = _mm_sign_epi16(temp0, sign_reg0); temp2 = _mm_sign_epi16(temp2, sign_reg2); _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0); _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2); cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b); cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b); mask0 = _mm_movemask_epi8(cmp0); mask1 = _mm_movemask_epi8(cmp1); u4_zero_coeff = 0; if(mask0) { if(mask0 == 0xffff) u4_zero_coeff+=8; else { cmp0 = _mm_and_si128(temp_1, cmp0); sum0 = _mm_hadd_epi16(cmp0, zero_8x16b); sum1 = _mm_hadd_epi16(sum0, zero_8x16b); sum2 = _mm_hadd_epi16(sum1, zero_8x16b); u4_zero_coeff += _mm_cvtsi128_si32(sum2); } } if(mask1) { if(mask1 == 0xffff) u4_zero_coeff+=8; else { cmp1 = _mm_and_si128(temp_1, cmp1); sum0 = _mm_hadd_epi16(cmp1, zero_8x16b); sum1 = _mm_hadd_epi16(sum0, zero_8x16b); sum2 = _mm_hadd_epi16(sum1, zero_8x16b); u4_zero_coeff += _mm_cvtsi128_si32(sum2); } } /* Return total nonzero coefficients in the current sub block */ u4_nonzero_coeff = 16 - u4_zero_coeff; *pu1_nnz = u4_nonzero_coeff; } /** ******************************************************************************* * * @brief * This function performs forward transform and quantization on a 4*4 chroma block * * @par Description: * The function accepts source buffer and estimation buffer. From these, it * computes the residue. This is residue is then transformed and quantized. * The transform and quantization are in placed computed. They use the residue * buffer for this. * * @param[in] pu1_src * Pointer to source sub-block * * @param[in] pu1_pred * Pointer to prediction sub-block * * @param[in] pi2_out * Pointer to residual sub-block * * @param[in] src_strd * Source stride * * @param[in] pred_strd * Prediction stride * * @param[in] dst_strd * Destination stride * * @param[in] u4_qbits * QP_BITS_h264_4x4 + floor(QP/6) * * @param[in] pu2_threshold_matrix * Pointer to Forward Quant Threshold Matrix * * @param[in] pu2_scale_matrix * Pointer to Forward Quant Scale Matrix * * @param[in] u4_round_factor * Quantization Round factor * * @param[out] pu1_nnz * Total non-zero coefficients in the current sub-block * * @returns * * @remarks * None * ******************************************************************************* */ void ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src,UWORD8 *pu1_pred,WORD16 *pi2_out, WORD32 src_strd,WORD32 pred_strd, const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,UWORD32 u4_round_factor, UWORD8 *pu1_nnz, WORD16 *pi2_alt_dc_addr) { WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0; WORD32 mask0, mask1; __m128i cmp0, cmp1, sum0, sum1, sum2; __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); __m128i temp_2 = _mm_set1_epi16(2); __m128i temp_1 = _mm_set1_epi16(1); __m128i src_r0, src_r1, src_r2, src_r3; __m128i pred_r0, pred_r1, pred_r2, pred_r3; __m128i temp0, temp1, temp2, temp3; __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero __m128i sign_reg0, sign_reg2; __m128i scalemat_r0_r1, scalemat_r2_r3; __m128i chroma_mask = _mm_set1_epi16 (0xFF); UNUSED (pu2_threshold_matrix); scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits src_r0 = _mm_and_si128(src_r0, chroma_mask); src_r1 = _mm_and_si128(src_r1, chroma_mask); src_r2 = _mm_and_si128(src_r2, chroma_mask); src_r3 = _mm_and_si128(src_r3, chroma_mask); // src_r0 = _mm_cvtepu8_epi16(src_r0); // src_r1 = _mm_cvtepu8_epi16(src_r1); // src_r2 = _mm_cvtepu8_epi16(src_r2); // src_r3 = _mm_cvtepu8_epi16(src_r3); pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits pred_r0 = _mm_and_si128(pred_r0, chroma_mask); pred_r1 = _mm_and_si128(pred_r1, chroma_mask); pred_r2 = _mm_and_si128(pred_r2, chroma_mask); pred_r3 = _mm_and_si128(pred_r3, chroma_mask); // pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits // pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits // pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits // pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits src_r0 = _mm_sub_epi16(src_r0, pred_r0); src_r1 = _mm_sub_epi16(src_r1, pred_r1); src_r2 = _mm_sub_epi16(src_r2, pred_r2); src_r3 = _mm_sub_epi16(src_r3, pred_r3); /* Perform Forward transform */ /*-------------------------------------------------------------*/ /* DCT [ Horizontal transformation ] */ /*-------------------------------------------------------------*/ // Matrix transpose /* * a0 a1 a2 a3 * b0 b1 b2 b3 * c0 c1 c2 c3 * d0 d1 d2 d3 */ temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3 temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3 temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 b0 c0 d0 a1 b1 c1 d1 temp3 = _mm_unpackhi_epi32(temp0, temp2); //a2 b2 c2 d2 a3 b3 c3 d3 src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0 src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1 src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2 src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3 /*----------------------------------------------------------*/ /* x0 = z0 + z3 */ temp0 = _mm_add_epi16(src_r0, src_r3); /* x1 = z1 + z2 */ temp1 = _mm_add_epi16(src_r1, src_r2); /* x2 = z1 - z2 */ temp2 = _mm_sub_epi16(src_r1, src_r2); /* x3 = z0 - z3 */ temp3 = _mm_sub_epi16(src_r0, src_r3); /* z0 = x0 + x1 */ src_r0 = _mm_add_epi16(temp0, temp1); /* z1 = (x3 << 1) + x2 */ src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1) src_r1 = _mm_add_epi16(src_r1, temp2); /* z2 = x0 - x1 */ src_r2 = _mm_sub_epi16(temp0, temp1); /* z3 = x3 - (x2 << 1) */ src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1) src_r3 = _mm_sub_epi16(temp3, src_r3); // Matrix transpose /* * a0 b0 c0 d0 * a1 b1 c1 d1 * a2 b2 c2 d2 * a3 b3 c3 d3 */ temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1 temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3 temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 a1 a2 a3 b0 b1 b2 b3 temp3 = _mm_unpackhi_epi32(temp0, temp2); //c0 c1 c2 c3 d0 d1 d2 d3 src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3 src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3 src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3 src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3 /*----------------------------------------------------------*/ /* x0 = z0 + z3 */ temp0 = _mm_add_epi16(src_r0, src_r3); /* x1 = z1 + z2 */ temp1 = _mm_add_epi16(src_r1, src_r2); /* x2 = z1 - z2 */ temp2 = _mm_sub_epi16(src_r1, src_r2); /* x3 = z0 - z3 */ temp3 = _mm_sub_epi16(src_r0, src_r3); /* z0 = x0 + x1 */ src_r0 = _mm_add_epi16(temp0, temp1); /* z1 = (x3 << 1) + x2 */ src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1) src_r1 = _mm_add_epi16(src_r1, temp2); /* z2 = x0 - x1 */ src_r2 = _mm_sub_epi16(temp0, temp1); /* z3 = x3 - (x2 << 1) */ src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1) src_r3 = _mm_sub_epi16(temp3, src_r3); tmp_dc = _mm_extract_epi16(src_r0,0); //a0 *pi2_alt_dc_addr = tmp_dc; src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3 src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3 sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0); sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2); sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0); sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2); sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); sign_reg2 = _mm_add_epi16(temp_1,sign_reg2); src_r0 = _mm_abs_epi16(src_r0); src_r2 = _mm_abs_epi16(src_r2); src_r1 = _mm_srli_si128(src_r0, 8); src_r0 = _mm_cvtepu16_epi32(src_r0); src_r1 = _mm_cvtepu16_epi32(src_r1); src_r3 = _mm_srli_si128(src_r2, 8); src_r2 = _mm_cvtepu16_epi32(src_r2); src_r3 = _mm_cvtepu16_epi32(src_r3); temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1); scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8); temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3); scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8); temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1); temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3); temp0 = _mm_mullo_epi32(temp0, src_r0); temp1 = _mm_mullo_epi32(temp1, src_r1); temp2 = _mm_mullo_epi32(temp2, src_r2); temp3 = _mm_mullo_epi32(temp3, src_r3); temp0 = _mm_add_epi32(temp0,rnd_fact); temp1 = _mm_add_epi32(temp1,rnd_fact); temp2 = _mm_add_epi32(temp2,rnd_fact); temp3 = _mm_add_epi32(temp3,rnd_fact); temp0 = _mm_srli_epi32(temp0,u4_qbits); temp1 = _mm_srli_epi32(temp1,u4_qbits); temp2 = _mm_srli_epi32(temp2,u4_qbits); temp3 = _mm_srli_epi32(temp3,u4_qbits); temp0 = _mm_packs_epi32 (temp0,temp1); temp2 = _mm_packs_epi32 (temp2,temp3); temp0 = _mm_sign_epi16(temp0, sign_reg0); temp2 = _mm_sign_epi16(temp2, sign_reg2); //temp0 = _mm_insert_epi16(temp0, tmp_dc, 0); _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0); _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2); cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b); cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b); mask0 = _mm_movemask_epi8(cmp0); mask1 = _mm_movemask_epi8(cmp1); u4_zero_coeff = 0; if(mask0) { if(mask0 == 0xffff) u4_zero_coeff+=8; else { cmp0 = _mm_and_si128(temp_1, cmp0); sum0 = _mm_hadd_epi16(cmp0, zero_8x16b); sum1 = _mm_hadd_epi16(sum0, zero_8x16b); sum2 = _mm_hadd_epi16(sum1, zero_8x16b); u4_zero_coeff += _mm_cvtsi128_si32(sum2); } } if(mask1) { if(mask1 == 0xffff) u4_zero_coeff+=8; else { cmp1 = _mm_and_si128(temp_1, cmp1); sum0 = _mm_hadd_epi16(cmp1, zero_8x16b); sum1 = _mm_hadd_epi16(sum0, zero_8x16b); sum2 = _mm_hadd_epi16(sum1, zero_8x16b); u4_zero_coeff += _mm_cvtsi128_si32(sum2); } } /* Return total nonzero coefficients in the current sub block */ u4_nonzero_coeff = 16 - u4_zero_coeff; *pu1_nnz = u4_nonzero_coeff; } /** ******************************************************************************* * * @brief * This function performs forward hadamard transform and quantization on a 4*4 block * * @par Description: * The function accepts source buffer and estimation buffer. From these, it * computes the residue. This is residue is then transformed and quantized. * The transform and quantization are in placed computed. They use the residue * buffer for this. * * @param[in] pu1_src * Pointer to source sub-block * * @param[in] pu1_pred * Pointer to prediction sub-block * * @param[in] pi2_out * Pointer to residual sub-block * * @param[in] src_strd * Source stride * * @param[in] pred_strd * Prediction stride * * @param[in] dst_strd * Destination stride * * @param[in] u4_qbits * QP_BITS_h264_4x4 + floor(QP/6) * * @param[in] pu2_threshold_matrix * Pointer to Forward Quant Threshold Matrix * * @param[in] pu2_scale_matrix * Pointer to Forward Quant Scale Matrix * * @param[in] u4_round_factor * Quantization Round factor * * @param[out] pu1_nnz * Total non-zero coefficients in the current sub-block * * @returns * * @remarks * None * */ void ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst, const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, UWORD32 u4_round_factor,UWORD8 *pu1_nnz ) { WORD32 u4_zero_coeff,u4_nonzero_coeff=0; __m128i cmp0, cmp1, sum0, sum1, sum2; WORD32 mask0, mask1; __m128i src_r0_r1, src_r2_r3, sign_reg; __m128i src_r0, src_r1, src_r2, src_r3; __m128i zero_8x16b = _mm_setzero_si128(); __m128i temp0, temp1, temp2, temp3; __m128i sign_reg0, sign_reg1, sign_reg2, sign_reg3; __m128i temp_1 = _mm_set1_epi16(1); __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]); UNUSED (pu2_threshold_matrix); src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1); src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg); //a0 a1 a2 a3 src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg); //b0 b1 b2 b3 sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3); src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg); //c0 c1 c2 c3 src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg); //d0 d1 d2 d3 /* Perform Inverse transform */ /*-------------------------------------------------------------*/ /* Forward DC transform [ Horizontal transformation ] */ /*-------------------------------------------------------------*/ // Matrix transpose /* * a0 a1 a2 a3 * b0 b1 b2 b3 * c0 c1 c2 c3 * d0 d1 d2 d3 */ temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1 temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1 temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3 temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3 src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0 src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1 src_r2 = _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2 src_r3 = _mm_unpackhi_epi64(temp1, temp3); //a3 b3 c3 d3 temp0 = _mm_add_epi32(src_r0, src_r3); temp1 = _mm_add_epi32(src_r1, src_r2); temp2 = _mm_sub_epi32(src_r1, src_r2); temp3 = _mm_sub_epi32(src_r0, src_r3); src_r0 = _mm_add_epi32(temp0, temp1); src_r1 = _mm_add_epi32(temp2, temp3); src_r2 = _mm_sub_epi32(temp0, temp1); src_r3 = _mm_sub_epi32(temp3, temp2); /*-------------------------------------------------------------*/ /* Forward DC transform [ Vertical transformation ] */ /*-------------------------------------------------------------*/ // Matrix transpose /* * a0 b0 c0 d0 * a1 b1 c1 d1 * a2 b2 c2 d2 * a3 b3 c3 d3 */ temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1 temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3 temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1 temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3 src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 a1 a2 a3 src_r1 = _mm_unpackhi_epi64(temp0, temp2); //b0 b1 b2 b3 src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3 src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3 temp0 = _mm_add_epi32(src_r0, src_r3); temp1 = _mm_add_epi32(src_r1, src_r2); temp2 = _mm_sub_epi32(src_r1, src_r2); temp3 = _mm_sub_epi32(src_r0, src_r3); src_r0 = _mm_add_epi32(temp0, temp1); src_r1 = _mm_add_epi32(temp2, temp3); src_r2 = _mm_sub_epi32(temp0, temp1); src_r3 = _mm_sub_epi32(temp3, temp2); src_r0 = _mm_srai_epi32(src_r0, 1); src_r1 = _mm_srai_epi32(src_r1, 1); src_r2 = _mm_srai_epi32(src_r2, 1); src_r3 = _mm_srai_epi32(src_r3, 1); // Quantization sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, src_r0); //Find sign of each value for later restoration sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, src_r1); sign_reg2 = _mm_cmpgt_epi32(zero_8x16b, src_r2); sign_reg3 = _mm_cmpgt_epi32(zero_8x16b, src_r3); sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >0 respectively sign_reg2 = _mm_packs_epi32(sign_reg2, sign_reg3); sign_reg0 = _mm_slli_epi16(sign_reg0, 1); //Sign = -2 or 0 depending on <0 or >0 respectively sign_reg2 = _mm_slli_epi16(sign_reg2, 1); sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); //Sign = -1 or 1 depending on <0 or >0 respectively sign_reg2 = _mm_add_epi16(temp_1,sign_reg2); src_r0 = _mm_abs_epi32(src_r0); //Absolute values src_r1 = _mm_abs_epi32(src_r1); src_r2 = _mm_abs_epi32(src_r2); src_r3 = _mm_abs_epi32(src_r3); temp0 = _mm_mullo_epi32(scale_val, src_r0); //multiply by pu2_scale_matrix[0] temp1 = _mm_mullo_epi32(scale_val, src_r1); temp2 = _mm_mullo_epi32(scale_val, src_r2); temp3 = _mm_mullo_epi32(scale_val, src_r3); temp0 = _mm_add_epi32(temp0,rnd_fact); //Add round factor temp1 = _mm_add_epi32(temp1,rnd_fact); temp2 = _mm_add_epi32(temp2,rnd_fact); temp3 = _mm_add_epi32(temp3,rnd_fact); temp0 = _mm_srli_epi32(temp0,u4_qbits); //RIght shift by qbits, unsigned variable, so shift right immediate works temp1 = _mm_srli_epi32(temp1,u4_qbits); temp2 = _mm_srli_epi32(temp2,u4_qbits); temp3 = _mm_srli_epi32(temp3,u4_qbits); temp0 = _mm_packs_epi32 (temp0,temp1); //Final values are 16-bits only. temp2 = _mm_packs_epi32 (temp2,temp3); temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration temp2 = _mm_sign_epi16(temp2, sign_reg2); _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0); _mm_storeu_si128((__m128i *) (&pi2_dst[8]), temp2); cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b); cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b); mask0 = _mm_movemask_epi8(cmp0); mask1 = _mm_movemask_epi8(cmp1); u4_zero_coeff = 0; if(mask0) { if(mask0 == 0xffff) u4_zero_coeff+=8; else { cmp0 = _mm_and_si128(temp_1, cmp0); sum0 = _mm_hadd_epi16(cmp0, zero_8x16b); sum1 = _mm_hadd_epi16(sum0, zero_8x16b); sum2 = _mm_hadd_epi16(sum1, zero_8x16b); u4_zero_coeff += _mm_cvtsi128_si32(sum2); } } if(mask1) { if(mask1 == 0xffff) u4_zero_coeff+=8; else { cmp1 = _mm_and_si128(temp_1, cmp1); sum0 = _mm_hadd_epi16(cmp1, zero_8x16b); sum1 = _mm_hadd_epi16(sum0, zero_8x16b); sum2 = _mm_hadd_epi16(sum1, zero_8x16b); u4_zero_coeff += _mm_cvtsi128_si32(sum2); } } /* Return total nonzero coefficients in the current sub block */ u4_nonzero_coeff = 16 - u4_zero_coeff; pu1_nnz[0] = u4_nonzero_coeff; } /** ******************************************************************************* * * @brief * This function performs forward hadamard transform and quantization on a 2*2 block * for both U and V planes * * @par Description: * The function accepts source buffer and estimation buffer. From these, it * computes the residue. This is residue is then transformed and quantized. * The transform and quantization are in placed computed. They use the residue * buffer for this. * * @param[in] pu1_src * Pointer to source sub-block * * @param[in] pu1_pred * Pointer to prediction sub-block * * @param[in] pi2_out * Pointer to residual sub-block * * @param[in] src_strd * Source stride * * @param[in] pred_strd * Prediction stride * * @param[in] dst_strd * Destination stride * * @param[in] u4_qbits * QP_BITS_h264_4x4 + floor(QP/6) * * @param[in] pu2_threshold_matrix * Pointer to Forward Quant Threshold Matrix * * @param[in] pu2_scale_matrix * Pointer to Forward Quant Scale Matrix * * @param[in] u4_round_factor * Quantization Round factor * * @param[out] pu1_nnz * Total non-zero coefficients in the current sub-block * * @returns * * @remarks * NNZ for dc is populated at 0 and 5th position of pu1_nnz * */ void ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst, const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, UWORD32 u4_round_factor,UWORD8 *pu1_nnz) { WORD32 val, nonzero_coeff_0=0, nonzero_coeff_1=0; __m128i cmp, cmp0, cmp1; __m128i sum0, sum1; WORD32 mask, mask0, mask1; __m128i src, plane_0, plane_1, temp0, temp1, sign_reg; __m128i zero_8x16b = _mm_setzero_si128(); __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]); __m128i sign_reg0, sign_reg1; __m128i temp_1 = _mm_set1_epi16(1); __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); UNUSED (pu2_threshold_matrix); src = _mm_loadu_si128((__m128i *)pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3 sign_reg = _mm_cmpgt_epi16(zero_8x16b, src); plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits plane_1 = _mm_unpackhi_epi16(src, sign_reg); //b0 b1 b2 b3 -- 32 bits temp0 = _mm_hadd_epi32(plane_0, plane_1); //a0+a1 a2+a3 b0+b1 b2+b3 temp1 = _mm_hsub_epi32(plane_0, plane_1); //a0-a1 a2-a3 b0-b1 b2-b3 plane_0 = _mm_hadd_epi32(temp0, temp1); //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3 plane_1 = _mm_hsub_epi32(temp0, temp1); //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3 temp0 = _mm_unpacklo_epi32(plane_0, plane_1); //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3 temp1 = _mm_unpackhi_epi32(plane_0, plane_1); //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3 plane_0 = _mm_unpacklo_epi64(temp0, temp1); //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3 plane_1 = _mm_unpackhi_epi64(temp0, temp1); //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3 plane_0 = _mm_shuffle_epi32(plane_0, 0xd8); //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3 plane_1 = _mm_shuffle_epi32(plane_1, 0xd8); //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3 // Quantization sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, plane_0); //Find sign of each value for later restoration sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, plane_1); sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >0 respectively sign_reg0 = _mm_slli_epi16(sign_reg0, 1); //Sign = -2 or 0 depending on <0 or >0 respectively sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); //Sign = -1 or 1 depending on <0 or >0 respectively plane_0 = _mm_abs_epi32(plane_0); //Absolute values plane_1 = _mm_abs_epi32(plane_1); temp0 = _mm_mullo_epi32(scale_val, plane_0); //multiply by pu2_scale_matrix[0] temp1 = _mm_mullo_epi32(scale_val, plane_1); //multiply by pu2_scale_matrix[0] temp0 = _mm_add_epi32(temp0,rnd_fact); //Add round factor temp1 = _mm_add_epi32(temp1,rnd_fact); temp0 = _mm_srli_epi32(temp0,u4_qbits); //RIght shift by qbits, unsigned variable, so shift right immediate works temp1 = _mm_srli_epi32(temp1,u4_qbits); temp0 = _mm_packs_epi32 (temp0,temp1); //Final values are 16-bits only. temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0); cmp = _mm_cmpeq_epi16(temp0, zero_8x16b); mask = _mm_movemask_epi8(cmp); mask0 = mask & 0xff; mask1 = mask>>8; if(mask0) { if(mask0 == 0xff) nonzero_coeff_0 += 4; else { cmp0 = _mm_and_si128(temp_1, cmp); sum0 = _mm_hadd_epi16(cmp0, zero_8x16b); sum1 = _mm_hadd_epi16(sum0, zero_8x16b); val = _mm_cvtsi128_si32(sum1); val = val & 0xffff; nonzero_coeff_0 += val; } } if(mask1) { if(mask1 == 0xff) nonzero_coeff_1 += 4; else { cmp1 = _mm_srli_si128(cmp, 8); cmp1 = _mm_and_si128(temp_1, cmp1); sum0 = _mm_hadd_epi16(cmp1, zero_8x16b); sum1 = _mm_hadd_epi16(sum0, zero_8x16b); nonzero_coeff_1 += _mm_cvtsi128_si32(sum1); } } pu1_nnz[0] = 4 - nonzero_coeff_0; pu1_nnz[1] = 4 - nonzero_coeff_1; }