/****************************************************************************** * * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ /** ******************************************************************************* * @file * ihevc_16x16_itrans_recon_x86_intr.c * * @brief * Contains function definitions for inverse * transform and reconstruction for 16x16. * * @author * 100470 * 100592 (edited by) * * @par List of Functions: * - ihevc_itrans_recon_16x16_sse42() * * @remarks * None * ******************************************************************************* */ #include <stdio.h> #include <string.h> #include "ihevc_typedefs.h" #include "ihevc_macros.h" #include "ihevc_platform_macros.h" #include "ihevc_defs.h" #include "ihevc_trans_tables.h" #include "ihevc_itrans_recon.h" #include "ihevc_func_selector.h" #include "ihevc_trans_macros.h" #include <immintrin.h> #include <emmintrin.h> #include <smmintrin.h> #include <tmmintrin.h> /** ******************************************************************************* * * @brief * This function performs inverse transform and * reconstruction for 16x16 input block * * @par Description: * Performs inverse transform, adds the * prediction data and clips output to 8 bit * * @param[in] pi2_src * Input 16x16 coefficients * * @param[in] pi2_tmp * Temporary 16x16
buffer for storing inverse * transform 1st stage output * * @param[in] pu1_pred * Prediction 16x16 block * * @param[in] pi2_dequant_coeff * Dequant Coeffs * * @param[out] pu1_dst * Output 16x16 block * * @param[in] qp_div * Quantization parameter / 6 * * @param[in] qp_rem * Quantization parameter % 6 * * @param[in] src_strd * Input stride * * @param[in] pred_strd * Prediction stride * * @param[in] dst_strd * Output Stride * * @param[in] zero_cols * Zero columns in pi2_src * * @returns Void * * @remarks * None * ******************************************************************************* */ void ihevc_itrans_recon_16x16_sse42(WORD16 *pi2_src, WORD16 *pi2_tmp, UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 src_strd, WORD32 pred_strd, WORD32 dst_strd, WORD32 zero_cols, WORD32 zero_rows) { __m128i m_temp_reg_0; __m128i m_temp_reg_1; __m128i m_temp_reg_10; __m128i m_temp_reg_11; __m128i m_temp_reg_12; __m128i m_temp_reg_13; __m128i m_temp_reg_14; __m128i m_temp_reg_20; __m128i m_temp_reg_21; __m128i m_temp_reg_22; __m128i m_temp_reg_23; __m128i m_temp_reg_24; __m128i m_temp_reg_25; __m128i m_temp_reg_26; __m128i m_temp_reg_27; __m128i m_temp_reg_30; __m128i m_temp_reg_31; __m128i m_temp_reg_32; __m128i m_temp_reg_33; __m128i m_temp_reg_34; __m128i m_temp_reg_35; __m128i m_temp_reg_36; __m128i m_temp_reg_37; __m128i m_temp_reg_40; __m128i m_temp_reg_41; __m128i m_temp_reg_42; __m128i m_temp_reg_43; __m128i m_temp_reg_44; __m128i m_temp_reg_45; __m128i m_temp_reg_46; __m128i m_temp_reg_47; __m128i m_temp_reg_70; __m128i m_temp_reg_71; __m128i m_temp_reg_72; __m128i m_temp_reg_73; __m128i m_temp_reg_74; __m128i m_temp_reg_75; __m128i m_temp_reg_76; __m128i m_temp_reg_77; __m128i m_rdng_factor; __m128i m_count; __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8; WORD32 i; WORD32 zero_last8_cols_stg1; WORD32 zero_last8_rows_stg1; WORD32 zero_last12_rows_stg1; WORD32 zero_last12_rows_stg2; WORD32 zero_last8_rows_stg2; WORD32 loop = 0; 
WORD32 i4_shift = IT_SHIFT_STAGE_1; WORD32 trans_size = TRANS_SIZE_16; /* Following 3 instructions replicates the value in the */ /* lower 16 bits of m_add_iq in the entire register */ /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */ zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0; zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0; zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0; zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0; zero_last8_rows_stg2 = zero_last8_cols_stg1; if(zero_last8_cols_stg1) { loop = 1; } else loop = 2; /* i = 0 => lower 8 samples */ /* i = 1 => higher 8 samples */ for(i = 0; i < loop; i++) { { WORD32 sample_half_index = i << 3; WORD16 *pi2_tmp_src = pi2_src + sample_half_index; WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); /* If last 12 rows are zero : Rishab */ if(zero_last12_rows_stg1) { /* eee */ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ { /* Loading coeff and src for use in next block */ m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 m_temp_reg_24 = 
_mm_slli_epi32(m_temp_reg_0, 6); m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 m_temp_reg_26 = m_temp_reg_24; m_temp_reg_27 = m_temp_reg_25; } /* eo */ /* eo0[0-3] */ { m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ /* e[0][0-3] stored in pi2_tmp[0][0-7] */ /* e[7][0-3] stored in pi2_tmp[0][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } /* eo0[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ /* e[0][4-7] stored in pi2_tmp[1][0-7] */ /* e[7][4-7] stored in pi2_tmp[1][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 } /* eo1[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ /* e[1][0-3] stored in pi2_tmp[2][0-7] */ /* e[6][0-3] stored in pi2_tmp[2][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 
pi2_scratch += 8; } /* eo1[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ /* e[1][4-7] stored in pi2_tmp[3][0-7] */ /* e[6][4-7] stored in pi2_tmp[3][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 } /* eo2[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* e[2][0-3] stored in pi2_tmp[4][0-7] */ /* e[5][0-3] stored in pi2_tmp[4][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } /* eo2[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); /* e[2][4-7] stored in pi2_tmp[5][0-7] */ /* e[5][4-7] stored in pi2_tmp[5][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 } /* eo3[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); /* e[3][0-3] stored in pi2_tmp[6][0-7] */ /* e[4][0-3] stored in pi2_tmp[6][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } /* eo3[4-7] */ { 
m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); /* e[3][4-7] stored in pi2_tmp[7][0-7] */ /* e[4][4-7] stored in pi2_tmp[7][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } } /* If last 8 rows are zero : Rishab */ else if(zero_last8_rows_stg1) { /* eeo */ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ { m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); } /* eee */ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ { /* Loading coeff and src for use in next block */ m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get signs m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); //m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); m_temp_reg_26 = m_temp_reg_24; m_temp_reg_27 = m_temp_reg_25; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 } /* eo0[0-3] */ { m_temp_reg_10 
= _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); /* e[0][0-3] stored in pi2_tmp[0][0-7] */ /* e[7][0-3] stored in pi2_tmp[0][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } /* eo0[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); /* e[0][4-7] stored in pi2_tmp[1][0-7] */ /* e[7][4-7] stored in pi2_tmp[1][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 } /* eo1[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); /* e[1][0-3] stored in pi2_tmp[2][0-7] */ /* e[6][0-3] stored in pi2_tmp[2][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 
_mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } /* eo1[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); /* e[1][4-7] stored in pi2_tmp[3][0-7] */ /* e[6][4-7] stored in pi2_tmp[3][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 } /* eo2[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* e[2][0-3] stored in pi2_tmp[4][0-7] */ /* e[5][0-3] stored in pi2_tmp[4][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } /* eo2[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); /* e[2][4-7] stored in pi2_tmp[5][0-7] */ /* e[5][4-7] stored in pi2_tmp[5][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 } /* eo3[0-3] */ { m_temp_reg_30 
= _mm_madd_epi16(m_temp_reg_10, m_coeff3); /* e[3][0-3] stored in pi2_tmp[6][0-7] */ /* e[4][0-3] stored in pi2_tmp[6][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } /* eo3[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); /* e[3][4-7] stored in pi2_tmp[7][0-7] */ /* e[4][4-7] stored in pi2_tmp[7][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } } /* If all the rows are non-zero : Rishab */ else { /* eeo */ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ { m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); } /* eee */ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ { /* Loading coeff and src for use in next block */ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, 
m_temp_reg_74); //row 0 and row 8 interleaved LSB's m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 } /* eo0[0-3] */ { m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); /* e[0][0-3] stored in pi2_tmp[0][0-7] */ /* e[7][0-3] stored in pi2_tmp[0][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } /* eo0[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); /* e[0][4-7] stored in pi2_tmp[1][0-7] */ /* e[7][4-7] stored in 
pi2_tmp[1][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 } /* eo1[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); /* e[1][0-3] stored in pi2_tmp[2][0-7] */ /* e[6][0-3] stored in pi2_tmp[2][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32); m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } /* eo1[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); /* e[1][4-7] stored in pi2_tmp[3][0-7] */ /* e[6][4-7] stored in pi2_tmp[3][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33); m_temp_reg_35 = 
_mm_add_epi32(m_temp_reg_35, m_temp_reg_33); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 } /* eo2[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); /* e[2][0-3] stored in pi2_tmp[4][0-7] */ /* e[5][0-3] stored in pi2_tmp[4][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } /* eo2[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); /* e[2][4-7] stored in pi2_tmp[5][0-7] */ /* e[5][4-7] stored in pi2_tmp[5][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 } /* eo3[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); /* e[3][0-3] stored in pi2_tmp[6][0-7] */ /* e[4][0-3] stored in pi2_tmp[6][8-15] */ m_temp_reg_34 = 
_mm_add_epi32(m_temp_reg_46, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } /* eo3[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); /* e[3][4-7] stored in pi2_tmp[7][0-7] */ /* e[4][4-7] stored in pi2_tmp[7][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += 8; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += 8; } } } { WORD32 sample_half_index = i << 3; WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd; m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src); pi2_tmp_src += (src_strd << 1); } /* o & stage 1 out */ { WORD32 j; WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; WORD16 *pi2_dst_scratch = (i) ? 
(pi2_tmp + 8 * trans_size) : pi2_tmp; WORD32 out_stride = (trans_size << 1); WORD32 in_stride = trans_size << 1; if(zero_last12_rows_stg1) { for(j = 0; j < 2; j++) { if(j) //H8B= higher 8 bytes L8B lower 8 bytes { m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B } else { m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B } m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 /* o0[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); m_count = _mm_cvtsi32_si128(i4_shift); m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o1[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 
= _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o2[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o3[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += 8; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += 8; } /* o4[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, 
m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch -= out_stride; } /* o5[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch -= out_stride; } /* o6[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch -= out_stride; } /* o7[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 
m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += 8; m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += 8; } } } else if(zero_last8_rows_stg1) { for(j = 0; j < 2; j++) { if(j) { m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B } else { m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B } m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 /* o0[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); m_count = _mm_cvtsi32_si128(i4_shift); m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, 
m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o1[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o2[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = 
_mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o3[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += 8; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += 8; } /* o4[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 
_mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch -= out_stride; } /* o5[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch -= out_stride; } /* o6[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch -= 
out_stride; } /* o7[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += 8; m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += 8; } } } else { for(j = 0; j < 2; j++) { if(j) //H8B= higher 8 bytes L8B lower 8 bytes { m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B } else { m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B } m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9 /* o0[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 
m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); m_count = _mm_cvtsi32_si128(i4_shift); m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o1[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 
m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o2[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 
m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o3[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += 8; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += 8; } /* o4[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 m_coeff6 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch -= out_stride; } /* o5[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, 
m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch -= out_stride; } /* o6[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch -= out_stride; } /* o7[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += 8; m_temp_reg_24 = 
_mm_add_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += 8; } } } } /* Transpose */ { WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; WORD16 *pi2_dst_scratch = ((i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp); WORD32 out_stride = (trans_size << 1); WORD32 in_stride = (trans_size << 1); WORD32 j; for(j = 0; j < 2; j++) { m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a pi2_src_scratch += in_stride; m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c pi2_src_scratch += in_stride; m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e pi2_src_scratch += in_stride; m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g pi2_src_scratch += 8; m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i pi2_src_scratch -= in_stride; m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k pi2_src_scratch -= in_stride; m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m pi2_src_scratch -= in_stride; m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o pi2_src_scratch += 8; m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0 m_temp_reg_43 = 
_mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0 m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); pi2_dst_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44); pi2_dst_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41); pi2_dst_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45); 
pi2_dst_scratch += 8; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42); pi2_dst_scratch -= out_stride; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46); pi2_dst_scratch -= out_stride; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43); pi2_dst_scratch -= out_stride; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47); pi2_dst_scratch += 8; } } } if(zero_last8_cols_stg1) { WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size); WORD32 out_stride = (trans_size << 1); WORD32 j; m_temp_reg_40 = _mm_setzero_si128(); for(j = 0; j < 2; j++) { _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); pi2_dst_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); pi2_dst_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); pi2_dst_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); pi2_dst_scratch += 8; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); pi2_dst_scratch -= out_stride; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); pi2_dst_scratch -= out_stride; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); pi2_dst_scratch -= out_stride; _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); pi2_dst_scratch += 8; } } /* Stage 2 */ for(i = 0; i < 2; i++) { //__m128i m_temp_reg_15,m_temp_reg_16; WORD16 *pi2_src_temp = (i) ? 
(pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp); WORD32 stride = (trans_size); WORD16 temp_array[256]; i4_shift = IT_SHIFT_STAGE_2; if(zero_last12_rows_stg2) { /* eeo */ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ { m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 pi2_src_temp += (stride * 9); if(!i) { pi2_src_temp += (stride * 6 + 8); } else { pi2_src_temp += (stride * 2 + 8); } pi2_src_temp -= (stride * 9); m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 m_temp_reg_20 = _mm_setzero_si128(); m_temp_reg_22 = _mm_setzero_si128(); m_temp_reg_21 = _mm_setzero_si128(); m_temp_reg_23 = _mm_setzero_si128(); } /* eee */ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ { /* Loading coeff and src for use in next block */ /* Loading coeff and src for use in next block */ m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70); m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 m_temp_reg_26 = m_temp_reg_24; m_temp_reg_27 = m_temp_reg_25; /* */ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20); m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20); } /* eo */ { WORD16 *pi2_scratch = temp_array; WORD32 out_stride = 8; /* eo0[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ /* e[0][0-3] stored in pu1_dst[0] */ /* e[7][0-3] stored in pu1_dst[1] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 
_mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35); pi2_scratch += out_stride; } /* eo0[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ /* e[0][4-7] stored in pu1_dst[2] */ /* e[7][4-7] stored in pu1_dst[3] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 } /* eo1[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ /* e[1][0-3] stored in pu1_dst[4] */ /* e[6][0-3] stored in pu1_dst[5] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } /* eo1[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ /* e[1][4-7] stored in pu1_dst[6]*/ /* e[6][4-7] stored in pu1_dst[7] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 } /* eo2[0-3] */ { m_temp_reg_30 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff1); /* e[2][0-3] stored in pu1_dst[8]*/ /* e[5][0-3] stored in pu1_dst[9] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } /* eo2[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); /* e[2][4-7] stored in pu1_dst[10]*/ /* e[5][4-7] stored in pu1_dst[11] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 } /* eo3[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); /* e[3][0-3] stored in pu1_dst[12]*/ /* e[4][0-3] stored in pu1_dst[13] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } /* eo3[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); /* e[3][4-7] stored in pu1_dst[14]*/ /* e[4][4-7] stored in pu1_dst[15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } } } else if(zero_last8_rows_stg2) { /* eeo */ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ /* 
eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ { m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[3][0]); //83 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[4][0]); //36 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 pi2_src_temp += (stride); m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4 pi2_src_temp += (stride * 8); if(!i) { pi2_src_temp += (stride * 6 + 8); } else { pi2_src_temp += (stride * 2 + 8); } pi2_src_temp -= (stride * 8); m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6 pi2_src_temp -= (stride); m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 m_temp_reg_76 = _mm_setzero_si128(); m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); } /* eee */ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ { /* Loading coeff and src for use in next block */ m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70); m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 m_temp_reg_26 = m_temp_reg_24; m_temp_reg_27 = m_temp_reg_25; m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); m_temp_reg_14 = 
_mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); } /* eo */ { WORD16 *pi2_scratch = temp_array; WORD32 out_stride = 8; /* eo0[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); /* e[0][0-3] stored in pu1_dst[0] */ /* e[7][0-3] stored in pu1_dst[1] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35); pi2_scratch += out_stride; } /* eo0[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); /* e[0][4-7] stored in pu1_dst[2] */ /* e[7][4-7] stored in pu1_dst[3] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 } /* eo1[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); /* e[1][0-3] stored in pu1_dst[4] */ /* e[6][0-3] stored in pu1_dst[5] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, 
m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } /* eo1[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); /* e[1][4-7] stored in pu1_dst[6]*/ /* e[6][4-7] stored in pu1_dst[7] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 } /* eo2[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* e[2][0-3] stored in pu1_dst[8]*/ /* e[5][0-3] stored in pu1_dst[9] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } /* eo2[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); /* e[2][4-7] stored in pu1_dst[10]*/ /* e[5][4-7] stored in pu1_dst[11] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 } /* eo3[0-3] */ { m_temp_reg_30 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff3); /* e[3][0-3] stored in pu1_dst[12]*/ /* e[4][0-3] stored in pu1_dst[13] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } /* eo3[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); /* e[3][4-7] stored in pu1_dst[14]*/ /* e[4][4-7] stored in pu1_dst[15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } } } else { /* eeo */ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ { m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 pi2_src_temp += (stride); m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4 pi2_src_temp += (stride * 7); m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8 pi2_src_temp += (stride); m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12 if(!i) { pi2_src_temp += (stride * 6 + 8); } else { pi2_src_temp += (stride * 2 + 8); } m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14 pi2_src_temp -= (stride); m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10 pi2_src_temp -= (stride * 7); m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6 pi2_src_temp -= (stride); m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 
m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); } /* eee */ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ { /* Loading coeff and src for use in next block */ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 } /* eo */ { WORD16 *pi2_scratch = temp_array; WORD32 out_stride = 8; /* eo0[0-3] */ { m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); /* e[0][0-3] stored in pi2_tmp[0][0-7] */ /* e[7][0-3] stored in pi2_tmp[0][8-15] */ 
m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } /* eo0[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); /* e[0][4-7] stored in pi2_tmp[1][0-7] */ /* e[7][4-7] stored in pi2_tmp[1][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 } /* eo1[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); /* e[1][0-3] stored in pi2_tmp[2][0-7] */ /* e[6][0-3] stored in pi2_tmp[2][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32); m_temp_reg_35 
= _mm_add_epi32(m_temp_reg_35, m_temp_reg_32); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } /* eo1[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); /* e[1][4-7] stored in pi2_tmp[3][0-7] */ /* e[6][4-7] stored in pi2_tmp[3][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33); m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 } /* eo2[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); /* e[2][0-3] stored in pi2_tmp[4][0-7] */ /* e[5][0-3] stored in pi2_tmp[4][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } /* eo2[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); /* e[2][4-7] stored in pi2_tmp[5][0-7] 
*/ /* e[5][4-7] stored in pi2_tmp[5][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 } /* eo3[0-3] */ { m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); /* e[3][0-3] stored in pi2_tmp[6][0-7] */ /* e[4][0-3] stored in pi2_tmp[6][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } /* eo3[4-7] */ { m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); /* e[3][4-7] stored in pi2_tmp[7][0-7] */ /* e[4][4-7] stored in pi2_tmp[7][8-15] */ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); pi2_scratch += out_stride; _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); pi2_scratch += out_stride; } } } if(zero_last12_rows_stg2) { /* o & stage 2 pre-transposed out */ { WORD32 j; WORD16 *pi2_src_scratch = 
temp_array; WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); WORD32 out_stride = (trans_size); WORD32 in_stride = (8) * 4; pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 pi2_src_temp += (stride * 9); if(0 == i) { pi2_src_temp -= (stride * 2 - 8); } else { pi2_src_temp -= (stride * 6 - 8); } pi2_src_temp -= (stride * 9); m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 for(j = 0; j < 2; j++) { if(j) { m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B } else { m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B } m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 /* o0[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); m_count = _mm_cvtsi32_si128(i4_shift); m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o1[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = 
_mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += ((!i) * out_stride + 8); } /* o2[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o3[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += 8; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += (i * out_stride + 8); } /* o4[0-3] */ { m_temp_reg_20 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o5[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += ((!i) * out_stride + 8); } /* o6[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = 
_mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o7[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += 8; m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += (i * out_stride + 8); } } } } else if(zero_last8_rows_stg2) { /* o & stage 2 pre-transposed out */ { WORD32 j; WORD16 *pi2_src_scratch = temp_array; WORD16 *pi2_dst_scratch = (i) ? 
(pi2_tmp + 8) : (pi2_tmp); WORD32 out_stride = (trans_size); WORD32 in_stride = (8) * 4; pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 pi2_src_temp += (stride); m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5 pi2_src_temp += (stride * 8); if(0 == i) { pi2_src_temp -= (stride * 2 - 8); } else { pi2_src_temp -= (stride * 6 - 8); } pi2_src_temp -= (stride * 8); m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7 pi2_src_temp -= (stride); m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 for(j = 0; j < 2; j++) { if(j) { m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B } else { m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B } m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 /* o0[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); m_count = _mm_cvtsi32_si128(i4_shift); m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 
m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o1[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += ((!i) * out_stride + 8); } /* o2[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = 
_mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o3[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += 8; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += (i * out_stride + 8); } /* o4[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = 
_mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o5[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += ((!i) * out_stride + 8); } /* o6[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i 
*)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o7[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += 8; m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += (i * out_stride + 8); } } } } else { /* o & stage 2 pre-transposed out */ { WORD32 j; WORD16 *pi2_src_scratch = temp_array; WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); WORD32 out_stride = (trans_size); WORD32 in_stride = (8) * 4; pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 pi2_src_temp += (stride); m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5 pi2_src_temp += (stride * 7); m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9 pi2_src_temp += (stride); m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13 if(0 == i) { pi2_src_temp -= (stride * 2 - 8); } else { pi2_src_temp -= (stride * 6 - 8); } m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15 pi2_src_temp -= (stride); m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11 pi2_src_temp -= (stride * 7); m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7 pi2_src_temp -= (stride); m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 for(j = 0; j < 2; j++) { if(j) //H8B= higher 8 bytes L8B lower 8 bytes { m_temp_reg_10 = 
_mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B } else { m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B } m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9 /* o0[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_rdng_factor = 
_mm_cvtsi32_si128((1 << (i4_shift - 1))); m_count = _mm_cvtsi32_si128(i4_shift); m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o1[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += ((!i) * out_stride + 8); } /* o2[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = 
_mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o3[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += 8; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_26 = 
_mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += (i * out_stride + 8); } /* o4[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o5[0-3] */ { 
m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += ((!i) * out_stride + 8); } /* o6[0-3] */ { m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch -= in_stride; m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80 m_coeff8 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += out_stride; } /* o7[0-3] */ { m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); pi2_src_scratch += 8; m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); pi2_dst_scratch += (i * out_stride + 8); } } } } } /* Transpose */ { WORD16 *pi2_src_scratch; UWORD8 *pu1_pred_temp = pu1_pred; WORD32 out_stride = dst_strd; WORD32 in_stride = trans_size; WORD32 j; m_temp_reg_1 = _mm_setzero_si128(); for(i = 0; i < 2; i++) { 
pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp; for(j = 0; j < 2; j++) { m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a pi2_src_scratch += in_stride; m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c pi2_src_scratch += ((!i) * in_stride + 8); m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e pi2_src_scratch += (in_stride); m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g pi2_src_scratch += (i * in_stride + 8); m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i pi2_src_scratch += in_stride; m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k pi2_src_scratch += ((!i) * in_stride + 8); m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m pi2_src_scratch += in_stride; m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o pi2_src_scratch += (i * in_stride + 8); m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0 m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0 m_temp_reg_35 = 
_mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0); m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12); m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); pu1_dst += out_stride; pu1_pred_temp += pred_strd; m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0); m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12); m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45); _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); pu1_dst += out_stride; pu1_pred_temp += pred_strd; m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0 m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0); m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12); m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, 
m_temp_reg_46); _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); pu1_dst += out_stride; pu1_pred_temp += pred_strd; m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0 m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0); m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12); m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47); _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); pu1_dst += out_stride; pu1_pred_temp += pred_strd; } } } }