/* C++ program  |  2206 lines  |  99.74 KB */

/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/

/**
 *******************************************************************************
 * @file
 *  impeg2_itrans_recon_x86_intr.c
 *
 * @brief
 *  Contains function definitions for inverse  quantization, inverse
 * transform and reconstruction
 *
 * @author
 *  100470
 *  100592 (edited by)
 *
 * @par List of Functions:
 *  - impeg2_idct_recon_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
#include <stdio.h>
#include <string.h>
#include "iv_datatypedef.h"
#include "impeg2_macros.h"
#include "impeg2_defs.h"
#include "impeg2_globals.h"

#include <immintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>


/**
 *******************************************************************************
 *
 * @brief
 *  This function performs the inverse transform (IDCT) and
 *  reconstruction for an 8x8 input block
 *
 * @par Description:
 *  Performs the two-stage inverse transform, adds the prediction data
 *  and clips the output to 8 bits. No inverse quantization is applied
 *  here: the input coefficients are fed directly to the transform.
 *
 * @param[in] pi2_src
 *  Input 8x8 coefficients
 *
 * @param[in] pi2_tmp
 *  Temporary 8x8 buffer for storing inverse
 *  transform 1st stage output (unused in this implementation)
 *
 * @param[in] pu1_pred
 *  Prediction 8x8 block
 *
 * @param[out] pu1_dst
 *  Output 8x8 block
 *
 * @param[in] src_strd
 *  Input stride, in units of WORD16 elements
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Output Stride
 *
 * @param[in] zero_cols
 *  Bit mask of zero columns in pi2_src; when bits 4-7 (mask 0xF0) are
 *  all set, stage 1 processes only the lower four columns
 *
 * @param[in] zero_rows
 *  Bit mask of zero rows in pi2_src; when bits 4-7 (mask 0xF0) are
 *  all set, a reduced stage 1 computation is selected
 *
 * @returns  Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


void impeg2_idct_recon_sse42(WORD16 *pi2_src,
                                  WORD16 *pi2_tmp,
                                  UWORD8 *pu1_pred,
                                  UWORD8 *pu1_dst,
                                  WORD32 src_strd,
                                  WORD32 pred_strd,
                                  WORD32 dst_strd,
                                  WORD32 zero_cols,
                                  WORD32 zero_rows)
{
    __m128i m_temp_reg_0;
    __m128i m_temp_reg_1;
    __m128i m_temp_reg_2;
    __m128i m_temp_reg_3;
    __m128i m_temp_reg_5;
    __m128i m_temp_reg_6;
    __m128i m_temp_reg_7;
    __m128i m_temp_reg_4;
    __m128i m_temp_reg_10;
    __m128i m_temp_reg_11;
    __m128i m_temp_reg_12;
    __m128i m_temp_reg_13;
    __m128i m_temp_reg_14;
    __m128i m_temp_reg_15;
    __m128i m_temp_reg_16;
    __m128i m_temp_reg_17;
    __m128i m_temp_reg_20;
    __m128i m_temp_reg_21;
    __m128i m_temp_reg_22;
    __m128i m_temp_reg_23;
    __m128i m_temp_reg_24;
    __m128i m_temp_reg_25;
    __m128i m_temp_reg_26;
    __m128i m_temp_reg_27;
    __m128i m_temp_reg_30;
    __m128i m_temp_reg_31;
    __m128i m_temp_reg_32;
    __m128i m_temp_reg_33;
    __m128i m_temp_reg_34;
    __m128i m_temp_reg_35;
    __m128i m_temp_reg_36;
    __m128i m_temp_reg_37;
    __m128i m_temp_reg_40;
    __m128i m_temp_reg_41;
    __m128i m_temp_reg_42;
    __m128i m_temp_reg_43;
    __m128i m_temp_reg_44;
    __m128i m_temp_reg_45;
    __m128i m_temp_reg_46;
    __m128i m_temp_reg_47;
    __m128i m_temp_reg_50;
    __m128i m_temp_reg_51;
    __m128i m_temp_reg_52;
    __m128i m_temp_reg_53;
    __m128i m_temp_reg_54;
    __m128i m_temp_reg_55;
    __m128i m_temp_reg_56;
    __m128i m_temp_reg_57;
    __m128i m_temp_reg_60;
    __m128i m_temp_reg_61;
    __m128i m_temp_reg_62;
    __m128i m_temp_reg_63;
    __m128i m_temp_reg_64;
    __m128i m_temp_reg_65;
    __m128i m_temp_reg_66;
    __m128i m_temp_reg_67;
    __m128i m_temp_reg_70;
    __m128i m_temp_reg_71;
    __m128i m_temp_reg_72;
    __m128i m_temp_reg_73;
    __m128i m_temp_reg_74;
    __m128i m_temp_reg_75;
    __m128i m_temp_reg_76;
    __m128i m_temp_reg_77;
    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;

    WORD32 check_row_stage_1;   /* Lokesh */
    WORD32 check_row_stage_2;   /* Lokesh */

    __m128i m_rdng_factor;
    WORD32 i4_shift = IDCT_STG1_SHIFT;
    UNUSED(pi2_tmp);
    check_row_stage_1   = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
    check_row_stage_2   = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;

    m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src);
    pi2_src += src_strd;
    m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src);
    pi2_src += src_strd;
    m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src);
    pi2_src += src_strd;
    m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src);
    pi2_src += src_strd;

    m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src);
    pi2_src += src_strd;
    m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src);
    pi2_src += src_strd;
    m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src);
    pi2_src += src_strd;
    m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src);

    if(!check_row_stage_2)
    {
        if(!check_row_stage_1)
        {
            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
            {
                //Interleaving 0,4 row in 0 , 1 Rishab
                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);

                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);

                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);

            }


            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
            /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
            {

                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36

                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
                //Interleaving 2,6 row in 4, 5 Rishab
                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);

                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);


                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */

                m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
                m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);

                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);



                /* e */

                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);

                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);

            }

            /* o */
            {

                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
                {

                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
                    //o0:1B*89+3B*75,5B*50+7B*18
                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);

                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);



                    /* Column 0 of destination computed here */
                    /* It is stored in m_temp_reg_50 */
                    /* Column 7 of destination computed here */
                    /* It is stored in m_temp_reg_57 */
                    /* Upper 8 bytes of both registers are zero due to zero_cols*/



                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_63 = _mm_setzero_si128();
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);

                    //o1:1B*75-3B*18,5B*89+7B*50
                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);

                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);

                    /* Loading coeff for computing o2  in the next block */

                    m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
                    m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);

                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */



                    /* Column 1 of destination computed here */
                    /* It is stored in m_temp_reg_51 */
                    /* Column 6 of destination computed here */
                    /* It is stored in m_temp_reg_56 */

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);

                    //o2:1B*50-3B*89,5B*18+7B*75
                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);

                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);


                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */

                    /* Loading coeff for computing o3  in the next block */

                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
                    m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);



                    /* Column 2 of destination computed here */
                    /* It is stored in m_temp_reg_52 */
                    /* Column 5 of destination computed here */
                    /* It is stored in m_temp_reg_55 */

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);

                    //o3:1B*18-3B*50,5B*75-7B*89
                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);

                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);



                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */



                    /* Column 3 of destination computed here */
                    /* It is stored in m_temp_reg_53 */
                    /* Column 4 of destination computed here */
                    /* It is stored in m_temp_reg_54 */

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);


                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
                }
            }

            /* Transpose of the destination 8x8 matrix done here */
            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
            /* respectively */
            {
                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);

                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);

                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);

                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);

                m_temp_reg_54 = _mm_setzero_si128();
                m_temp_reg_55 = _mm_setzero_si128();
                m_temp_reg_56 = _mm_setzero_si128();
                m_temp_reg_57 = _mm_setzero_si128();
            }
        }
        else
        {
            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
            {
                //Interleaving 0,4 row in 0 , 1 Rishab
                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);

                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);

                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);

            }


            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
            /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
            {

                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36

                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
                //Interleaving 2,6 row in 4, 5 Rishab
                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);

                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);


                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */

                m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
                m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);

                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);



                /* e */

                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);

                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);

            }

            /* o */
            {

                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
                {

                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
                    m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
                    //o0:1B*89+3B*75,5B*50+7B*18
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);

                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);

                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);



                    /* Column 0 of destination computed here */
                    /* It is stored in m_temp_reg_50 */
                    /* Column 7 of destination computed here */
                    /* It is stored in m_temp_reg_57 */
                    /* Upper 8 bytes of both registers are zero due to zero_cols*/



                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_63 = _mm_setzero_si128();
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);

                    //o1:1B*75-3B*18,5B*89+7B*50
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);

                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);

                    /* Loading coeff for computing o2  in the next block */

                    m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
                    m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);

                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);



                    /* Column 1 of destination computed here */
                    /* It is stored in m_temp_reg_51 */
                    /* Column 6 of destination computed here */
                    /* It is stored in m_temp_reg_56 */

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);

                    //o2:1B*50-3B*89,5B*18+7B*75
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);

                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);


                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */

                    /* Loading coeff for computing o3  in the next block */

                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
                    m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);

                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);


                    /* Column 2 of destination computed here */
                    /* It is stored in m_temp_reg_52 */
                    /* Column 5 of destination computed here */
                    /* It is stored in m_temp_reg_55 */

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);

                    //o3:1B*18-3B*50,5B*75-7B*89
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);

                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);



                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */

                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);


                    /* Column 3 of destination computed here */
                    /* It is stored in m_temp_reg_53 */
                    /* Column 4 of destination computed here */
                    /* It is stored in m_temp_reg_54 */

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);


                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
                }
            }

            /* Transpose of the destination 8x8 matrix done here */
            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
            /* respectively */
            {
                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);

                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);

                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);

                m_temp_reg_54 = _mm_setzero_si128();
                m_temp_reg_55 = _mm_setzero_si128();
                m_temp_reg_56 = _mm_setzero_si128();
                m_temp_reg_57 = _mm_setzero_si128();
            }
        }

        /* Stage 2 */
        i4_shift = IDCT_STG2_SHIFT;
        {
            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
            {
                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub

                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);

                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);


                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]);
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]);
            }


            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
            {

                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);


                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);

                /* Loading coeff for computing o0 in the next block */
                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]);


                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);



                /* e */

                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);

                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);

                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);

                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);

            }

            /* o */
            {

                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
                {
                    //o0:1B*89+3B*75,1T*89+3T*75
                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);

                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
                    /* Loading coeff for computing o1 in the next block */
                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]);



                    /* Column 0 of destination computed here */
                    /* It is stored in m_temp_reg_50 */
                    /* Column 7 of destination computed here */
                    /* It is stored in m_temp_reg_57 */

                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);

                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);

                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);

                    //o1:1B*75-3B*18,1T*75-3T*18
                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);

                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);

                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);


                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */


                    /* Loading coeff for computing o2  in the next block */
                    m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]);



                    /* Column 1 of destination computed here */
                    /* It is stored in m_temp_reg_51 */
                    /* Column 6 of destination computed here */
                    /* It is stored in m_temp_reg_56 */

                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);

                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);

                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);

                    //o2:1B*50-3B*89,5T*18+7T*75.
                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
                    m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);

                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);

                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);


                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */

                    /* Loading coeff for computing o3  in the next block */

                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]);


                    /* Column 2 of destination computed here */
                    /* It is stored in m_temp_reg_52 */
                    /* Column 5 of destination computed here */
                    /* It is stored in m_temp_reg_55 */

                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);

                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);

                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);

                    //o3:1B*18-3B*50,1T*18-3T*50
                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
                    m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);

                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);


                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);



                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */


                    /* Column 3 of destination computed here */
                    /* It is stored in m_temp_reg_53 */
                    /* Column 4 of destination computed here */
                    /* It is stored in m_temp_reg_54 */

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
                    m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);

                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
                    m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
                    m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);

                    m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
                    m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
                    m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
                    m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);

                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
                }
            }

            /* Transpose of the destination 8x8 matrix done here */
            /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
            /* respectively */
            {
                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);

                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
                m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
                m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
                m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
                m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);

                m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
                m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
                m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
                m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
            }

            /* Recon and store */
            {
                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);

                m_temp_reg_50 = _mm_setzero_si128();
                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
                m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
                m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
                m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);

                m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
                m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
                m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
                m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
                m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
                m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
                m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
                m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);

                m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
                m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
                m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
                m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
                m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
                m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
                m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
                m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);

                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
                pu1_dst += dst_strd;
            }
        }
    }
    else

    {

        /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
        /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
        if(!check_row_stage_1)
        {
            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
            {
                //Interleaving 0,4 row in 0 , 1 Rishab
                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);

                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);

                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);


                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
            }


            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
            {

                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36

                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
                //Interleaving 2,6 row in 4, 5 Rishab
                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);

                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);

                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);



                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */

                m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
                //m_coeff4 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[3][0]);

                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
                //m_coeff2 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[1][0]);

            }

            /* e */
            {
                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);

                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);

                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);

                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);

            }

            /* o */
            {

                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
                {

                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
                    m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
                    //o0:1B*89+3B*75,1T*89+3T*75
                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);

                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);

                }

                /* Column 0 of destination computed here */
                /* It is stored in m_temp_reg_50 */
                /* Column 7 of destination computed here */
                /* It is stored in m_temp_reg_57 */
                {


                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);

                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);

                    //o1:1B*75-3B*18,1T*75-3T*18
                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);

                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);

                    /* Loading coeff for computing o2  in the next block */

                    m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);

                }

                /* Column 1 of destination computed here */
                /* It is stored in m_temp_reg_51 */
                /* Column 6 of destination computed here */
                /* It is stored in m_temp_reg_56 */
                {
                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);

                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);

                    //o2:1B*50-3B*89,1T*50-3T*89
                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
                    m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);

                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);


                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */


                    /* Loading coeff for computing o3  in the next block */

                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);

                }

                /* Column 2 of destination computed here */
                /* It is stored in m_temp_reg_52 */
                /* Column 5 of destination computed here */
                /* It is stored in m_temp_reg_55 */
                {
                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);

                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);

                    //o3:1B*18-3B*50,1T*18-3T*50
                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
                    m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);

                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);



                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */


                }

                /* Column 3 of destination computed here */
                /* It is stored in m_temp_reg_53 */
                /* Column 4 of destination computed here */
                /* It is stored in m_temp_reg_54 */
                {
                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);

                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);

                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
                }
            }

            /* Transpose of the destination 8x8 matrix done here */
            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
            /* respectively */
            {


                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);

                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);

                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);

                m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
            }
        }
        else
        {

            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
            {
                //Interleaving 0,4 row in 0 , 1 Rishab
                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);

                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);

                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);


                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
            }


            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
            {

                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36

                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
                //Interleaving 2,6 row in 4, 5 Rishab
                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);

                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);

                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);



                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */

                m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
                m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);

                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);

            }

            /* e */
            {
                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);

                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);

                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);

                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);

            }

            /* o */
            {

                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
                {

                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
                    m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
                    m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
                    m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
                    //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);


                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);

                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
                }

                /* Column 0 of destination computed here */
                /* It is stored in m_temp_reg_50 */
                /* Column 7 of destination computed here */
                /* It is stored in m_temp_reg_57 */
                {


                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);

                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);

                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);

                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);

                    /* Loading coeff for computing o2  in the next block */

                    m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
                    m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);

                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
                    m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
                }

                /* Column 1 of destination computed here */
                /* It is stored in m_temp_reg_51 */
                /* Column 6 of destination computed here */
                /* It is stored in m_temp_reg_56 */
                {
                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);

                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);

                    //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);

                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);


                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */


                    /* Loading coeff for computing o3  in the next block */

                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
                    m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);

                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
                }

                /* Column 2 of destination computed here */
                /* It is stored in m_temp_reg_52 */
                /* Column 5 of destination computed here */
                /* It is stored in m_temp_reg_55 */
                {
                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);

                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);

                    //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);

                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);



                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */


                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
                    m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
                }

                /* Column 3 of destination computed here */
                /* It is stored in m_temp_reg_53 */
                /* Column 4 of destination computed here */
                /* It is stored in m_temp_reg_54 */
                {
                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);

                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);

                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);

                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);

                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
                }
            }

            /* Transpose of the destination 8x8 matrix done here */
            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
            /* respectively */
            {


                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);

                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);

                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);

                m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
            }
        }
        /* Stage 2 */

        i4_shift = IDCT_STG2_SHIFT;

        {

            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
            {
                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub

                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);

                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);


                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]);
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]);
            }


            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
            {
                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);


                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);

                /* Loading coeff for computing o0 in the next block */
                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]);
                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[1][0]);


                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
            }

            /* e */
            {
                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);

                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);

                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);

                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);

            }

            /* o */
            {
                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);

                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
                {
                    //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);

                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
                    /* Loading coeff for computing o1 in the next block */
                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]);
                    m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[3][0]);

                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
                }

                /* Column 0 of destination computed here */
                /* It is stored in m_temp_reg_50 */
                /* Column 7 of destination computed here */
                /* It is stored in m_temp_reg_57 */
                {
                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);

                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);

                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);

                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);

                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);

                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);


                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */


                    /* Loading coeff for computing o2  in the next block */
                    m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]);
                    m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[5][0]);

                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
                    m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
                }

                /* Column 1 of destination computed here */
                /* It is stored in m_temp_reg_51 */
                /* Column 6 of destination computed here */
                /* It is stored in m_temp_reg_56 */
                {
                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);

                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);

                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);

                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);

                    //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);

                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);


                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */

                    /* Loading coeff for computing o3  in the next block */

                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]);
                    m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[7][0]);

                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
                }

                /* Column 2 of destination computed here */
                /* It is stored in m_temp_reg_52 */
                /* Column 5 of destination computed here */
                /* It is stored in m_temp_reg_55 */
                {
                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);

                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);

                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);

                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);

                    //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);

                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);



                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */


                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
                    m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
                }

                /* Column 3 of destination computed here */
                /* It is stored in m_temp_reg_53 */
                /* Column 4 of destination computed here */
                /* It is stored in m_temp_reg_54 */
                {
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
                    m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);

                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
                    m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
                    m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);

                    m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
                    m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
                    m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
                    m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);

                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
                }
            }

            /* Transpose of the destination 8x8 matrix done here */
            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
            /* respectively */
            {
                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);

                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
                m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
                m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
                m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
                m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);

                m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
                m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
                m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
                m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
            }

            /* Recon and store */
            {
                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
                pu1_pred += pred_strd;
                m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);


                m_temp_reg_50 = _mm_setzero_si128();
                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
                m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
                m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
                m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);

                m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
                m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
                m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
                m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
                m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
                m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
                m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
                m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);

                m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
                m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
                m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
                m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
                m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
                m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
                m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
                m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);

                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
                pu1_dst += dst_strd;
                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
                pu1_dst += dst_strd;

            }


        }


    }
}

/**
 *******************************************************************************
 *
 * @brief
 *  DC-only inverse transform with a per-sample stage 2 additive term, followed
 * by reconstruction of an 8x8 block
 *
 * @par Description:
 *  Only pi2_src[0] is read from the coefficient buffer. It is multiplied by
 * gai2_impeg2_idct_q15[0], rounded with IDCT_STG1_ROUND and shifted right by
 * IDCT_STG1_SHIFT, then multiplied by gai2_impeg2_idct_q11[0]. For every
 * output sample the matching entry of gai2_impeg2_mismatch_stg2_additive
 * (read as 8 consecutive 16 bit values per row) is added, followed by
 * IDCT_STG2_ROUND and an arithmetic right shift by IDCT_STG2_SHIFT. The
 * result is added to the prediction sample and saturated to unsigned 8 bit.
 * NOTE(review): the table name suggests this is the path taken when mismatch
 * control modified the block - confirm against the caller.
 *
 * @param[in] pi2_src
 *  Input coefficients; only element 0 is read
 *
 * @param[in] pi2_tmp
 *  Unused
 *
 * @param[in] pu1_pred
 *  Prediction block; 8 bytes are read from each of 8 rows
 *
 * @param[out] pu1_dst
 *  Output block; 8 bytes are written to each of 8 rows
 *
 * @param[in] src_strd
 *  Unused
 *
 * @param[in] pred_strd
 *  Distance in bytes between consecutive prediction rows
 *
 * @param[in] dst_strd
 *  Distance in bytes between consecutive output rows
 *
 * @param[in] zero_cols
 *  Unused
 *
 * @param[in] zero_rows
 *  Unused
 *
 * @returns  Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void impeg2_idct_recon_dc_mismatch_sse42(WORD16 *pi2_src,
                            WORD16 *pi2_tmp,
                            UWORD8 *pu1_pred,
                            UWORD8 *pu1_dst,
                            WORD32 src_strd,
                            WORD32 pred_strd,
                            WORD32 dst_strd,
                            WORD32 zero_cols,
                            WORD32 zero_rows)
{
    WORD32 val;
    WORD32 row;
    __m128i dc_plus_round_4x32b, mismatch_stg2_additive;
    __m128i pred_r, pred_half0, pred_half1;
    __m128i temp0, temp1;

    UNUSED(pi2_tmp);
    UNUSED(src_strd);
    UNUSED(zero_cols);
    UNUSED(zero_rows);

    /* Stage 1 on the DC coefficient, then the stage 2 multiply */
    val = pi2_src[0] * gai2_impeg2_idct_q15[0];
    val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);
    val *= gai2_impeg2_idct_q11[0];

    /* The DC term and the stage 2 rounding constant are the same for every */
    /* sample, so they are summed once here. 32 bit lane addition wraps,    */
    /* hence regrouping the additions does not change any result bit        */
    dc_plus_round_4x32b = _mm_add_epi32(_mm_set1_epi32(val),
                                        _mm_set1_epi32(IDCT_STG2_ROUND));

    /* One iteration per output row; each row handles 8 samples */
    for(row = 0; row < 8; row++)
    {
        /* 8 additive terms (16 bit) for this row, sign extended to 32 bit */
        mismatch_stg2_additive = _mm_loadu_si128(
                        (const __m128i *)(gai2_impeg2_mismatch_stg2_additive + 8 * row));
        temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
        mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
        temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);

        /* 8 prediction bytes for this row, zero extended to 32 bit */
        pred_r = _mm_loadl_epi64((const __m128i *)(pu1_pred + row * pred_strd));
        pred_r = _mm_cvtepu8_epi16(pred_r);
        pred_half0 = _mm_cvtepu16_epi32(pred_r);
        pred_r = _mm_srli_si128(pred_r, 8);
        pred_half1 = _mm_cvtepu16_epi32(pred_r);

        /* (additive + dc + round) >> shift */
        temp0 = _mm_add_epi32(temp0, dc_plus_round_4x32b);
        temp1 = _mm_add_epi32(temp1, dc_plus_round_4x32b);
        temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
        temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);

        /* Add prediction */
        temp0 = _mm_add_epi32(temp0, pred_half0);
        temp1 = _mm_add_epi32(temp1, pred_half1);

        /* Saturating narrow 32 -> 16 -> 8 bit. Only the low 8 bytes are    */
        /* stored, so the register is simply packed with itself             */
        temp0 = _mm_packus_epi32(temp0, temp1);
        temp0 = _mm_packus_epi16(temp0, temp0);

        _mm_storel_epi64((__m128i *)(pu1_dst + row * dst_strd), temp0);
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *  DC-only inverse transform and reconstruction for an 8x8 block
 *
 * @par Description:
 *  Derives a single value from pi2_src[0] by applying the first entry of the
 * q15 table with the stage 1 rounding/shift, followed by the first entry of
 * the q11 table with the stage 2 rounding/shift. That value is added to each
 * of the 8x8 prediction samples and the sums are stored to the destination
 * saturated to unsigned 8 bit.
 *
 * @param[in] pi2_src
 *  Input coefficients; only pi2_src[0] is read
 *
 * @param[in] pi2_tmp
 *  Unused
 *
 * @param[in] pu1_pred
 *  Prediction block; 8 rows of 8 samples are read
 *
 * @param[out] pu1_dst
 *  Output block; 8 rows of 8 samples are written
 *
 * @param[in] src_strd
 *  Unused
 *
 * @param[in] pred_strd
 *  Offset between consecutive rows of pu1_pred
 *
 * @param[in] dst_strd
 *  Offset between consecutive rows of pu1_dst
 *
 * @param[in] zero_cols
 *  Unused
 *
 * @param[in] zero_rows
 *  Unused
 *
 * @returns  Void
 *
 * @remarks
 *  NOTE(review): the final 16 to 8 bit pack reads its input as signed 16 bit,
 * so this relies on prediction + DC value staying below 32768. The table
 * entries and shift constants are not visible here - confirm.
 *
 *******************************************************************************
 */
void impeg2_idct_recon_dc_sse42(WORD16 *pi2_src,
                            WORD16 *pi2_tmp,
                            UWORD8 *pu1_pred,
                            UWORD8 *pu1_dst,
                            WORD32 src_strd,
                            WORD32 pred_strd,
                            WORD32 dst_strd,
                            WORD32 zero_cols,
                            WORD32 zero_rows)
{
    WORD32 dc_val;
    WORD32 row_pair;
    __m128i dc_4x32b;

    UNUSED(pi2_tmp);
    UNUSED(src_strd);
    UNUSED(zero_cols);
    UNUSED(zero_rows);

    /* Stage 1 followed by stage 2 scaling of the DC coefficient */
    dc_val = pi2_src[0] * gai2_impeg2_idct_q15[0];
    dc_val = (dc_val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT;
    dc_val = dc_val * gai2_impeg2_idct_q11[0];
    dc_val = (dc_val + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT;

    dc_4x32b = _mm_set1_epi32(dc_val);

    /* The 8 rows are handled two at a time: rows 0-1, 2-3, 4-5 and 6-7 */
    for(row_pair = 0; row_pair < 4; row_pair++)
    {
        UWORD8 *pu1_pred_row = pu1_pred + 2 * row_pair * pred_strd;
        UWORD8 *pu1_dst_row = pu1_dst + 2 * row_pair * dst_strd;
        __m128i row_a_8x16b, row_b_8x16b;
        __m128i row_a_lo, row_a_hi, row_b_lo, row_b_hi;

        /* Both rows are fetched before either row is written back */
        row_a_8x16b = _mm_loadl_epi64((__m128i *)pu1_pred_row);
        row_b_8x16b = _mm_loadl_epi64((__m128i *)(pu1_pred_row + pred_strd));

        /* Zero extend the 8 samples of each row from 8 to 16 bit */
        row_a_8x16b = _mm_cvtepu8_epi16(row_a_8x16b);
        row_b_8x16b = _mm_cvtepu8_epi16(row_b_8x16b);

        /* Samples 0-3 and 4-7 of each row widened to 32 bit */
        row_a_lo = _mm_cvtepu16_epi32(row_a_8x16b);
        row_a_hi = _mm_cvtepu16_epi32(_mm_srli_si128(row_a_8x16b, 8));
        row_b_lo = _mm_cvtepu16_epi32(row_b_8x16b);
        row_b_hi = _mm_cvtepu16_epi32(_mm_srli_si128(row_b_8x16b, 8));

        /* Add the DC contribution to every sample */
        row_a_lo = _mm_add_epi32(row_a_lo, dc_4x32b);
        row_a_hi = _mm_add_epi32(row_a_hi, dc_4x32b);
        row_b_lo = _mm_add_epi32(row_b_lo, dc_4x32b);
        row_b_hi = _mm_add_epi32(row_b_hi, dc_4x32b);

        /* Narrow 32 -> 16 bit with unsigned saturation */
        row_a_8x16b = _mm_packus_epi32(row_a_lo, row_a_hi);
        row_b_8x16b = _mm_packus_epi32(row_b_lo, row_b_hi);

        /* Narrow 16 -> 8 bit with unsigned saturation. Only the lower 8   */
        /* bytes are stored, so the second operand does not affect output. */
        row_a_8x16b = _mm_packus_epi16(row_a_8x16b, row_a_8x16b);
        row_b_8x16b = _mm_packus_epi16(row_b_8x16b, row_b_8x16b);

        _mm_storel_epi64((__m128i *)pu1_dst_row, row_a_8x16b);
        _mm_storel_epi64((__m128i *)(pu1_dst_row + dst_strd), row_b_8x16b);
    }
}