/* C++ program  |  2634 lines  |  119.18 KB */

/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_chroma_intra_pred_filters_atom_intr.c
*
* @brief
*  Contains function Definition for intra prediction  interpolation filters
*
*
* @author
*  Ittiam
*
* @par List of Functions:
*  ihevc_intra_pred_chroma_planar_ssse3()
*
*  ihevc_intra_pred_chroma_dc_ssse3()
*
*  ihevc_intra_pred_chroma_horz_ssse3()
*
*  ihevc_intra_pred_chroma_ver_ssse3()
*
*  ihevc_intra_pred_chroma_mode2_ssse3()
*
*  ihevc_intra_pred_chroma_mode_18_34_ssse3()
*
*  ihevc_intra_pred_chroma_mode_3_to_9_ssse3()
*
*  ihevc_intra_pred_chroma_mode_11_to_17_ssse3()
*
*  ihevc_intra_pred_chroma_mode_19_to_25_ssse3()
*
*  ihevc_intra_pred_chroma_mode_27_to_33_ssse3()
*
*
*
* @remarks
*  None
*
*******************************************************************************
*/


/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

#include "ihevc_typedefs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_intra_pred.h"

#include "ihevc_chroma_intra_pred.h"
#include "ihevc_common_tables.h"
#include "ihevc_tables_x86_intr.h"

#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>

#include <immintrin.h>


/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128
#define T16_4NT 64
#define T16C_4NT 64
#define T8C_4NT 32
/****************************************************************************/
/* Function Macros                                                          */
/****************************************************************************/

/* Evaluates to 1 when bit number 'x' of 'y' is set and to 0 otherwise.     */
/* Every argument and the complete expansion are parenthesised so that the  */
/* macro behaves as one expression when it is given compound arguments or   */
/* is used inside a larger expression; 'x' is evaluated only once.          */
#define GET_BIT(y,x) (((y) & (1 << (x))) != 0)

/* tables to shuffle 8-bit values */

/*****************************************************************************/
/* Function Definition                                                      */
/*****************************************************************************/



/**
*******************************************************************************
*
* @brief
*  Planar intra prediction for an interleaved chroma block. Reference samples
* are read from 'pu1_ref' and the predicted block is written to 'pu1_dst'.
* Refer to section 8.4.4.2.4 in the standard
*
* @par Description:
*  The reference is addressed as byte pairs (index 2 * k and 2 * k + 1). For
* output row 'row' and pair 'col' each component is computed as
*
*    ((nt - 1 - col) * ref[two_nt - 1 - row] + (col + 1) * ref[three_nt + 1] +
*     (nt - 1 - row) * ref[two_nt + 1 + col] + (row + 1) * ref[nt - 1] + nt)
*        >> (log2(nt) + 1)
*
* Eight output bytes (four pairs) are produced per inner iteration using
* 16-bit lanes. Nothing is written when nt is not a multiple of 4.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the interleaved reference samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size; 4, 8 and 16 select shifts of 3, 4 and 5,
*  any other value uses a shift of 6
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_planar_ssse3(UWORD8 *pu1_ref,
                                          WORD32 src_strd,
                                          UWORD8 *pu1_dst,
                                          WORD32 dst_strd,
                                          WORD32 nt,
                                          WORD32 mode)
{
    WORD32 row, col;
    WORD32 shift;
    WORD32 two_nt = 2 * nt;
    WORD32 three_nt = 3 * nt;

    __m128i top_right_8x16b, bottom_left_8x16b, round_8x16b;
    __m128i col_wt_init_8x16b, rev_col_wt_init_8x16b;
    __m128i wt_step_8x16b, zero_8x16b;

    UNUSED(src_strd);
    UNUSED(mode);

    /* normalisation shift is log2(nt) + 1 */
    if(nt == 16)
        shift = 5;
    else if(nt == 8)
        shift = 4;
    else if(nt == 4)
        shift = 3;
    else
        shift = 6;

    /* Each 32-bit lane holds one pair: low 16 bits = even byte, high 16 bits */
    /* = odd byte, i.e. the pair repeated across all eight 16-bit lanes       */
    top_right_8x16b = _mm_set1_epi32(pu1_ref[2 * (three_nt + 1)] |
                                     (pu1_ref[2 * (three_nt + 1) + 1] << 16));
    bottom_left_8x16b = _mm_set1_epi32(pu1_ref[2 * (nt - 1)] |
                                       (pu1_ref[2 * (nt - 1) + 1] << 16));

    round_8x16b = _mm_set1_epi16(nt);
    wt_step_8x16b = _mm_set1_epi16(4);
    zero_8x16b = _mm_set1_epi32(0);

    /* Column weights for the first four pairs of a row:                      */
    /* (col + 1) = 1,1,2,2,3,3,4,4 and (nt - 1 - col) = nt-1,nt-1,nt-2,...    */
    col_wt_init_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);
    rev_col_wt_init_8x16b = _mm_sub_epi16(_mm_set1_epi16(nt - 1),
                                          _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0));

    if(nt % 4 != 0)
    {
        return;
    }

    for(row = 0; row < nt; row++)
    {
        __m128i left_8x16b, top_wt_8x16b, row_term_8x16b;
        __m128i col_wt_8x16b, rev_col_wt_8x16b;

        /* pair pu1_ref[2 * (two_nt - 1 - row)] replicated across the lanes */
        left_8x16b = _mm_set1_epi32(pu1_ref[2 * (two_nt - 1 - row)] |
                                    (pu1_ref[2 * (two_nt - 1 - row) + 1] << 16));

        top_wt_8x16b = _mm_set1_epi16((nt - 1 - row));

        /* (row + 1) * pu1_ref[nt - 1] + nt : constant along the row */
        row_term_8x16b = _mm_mullo_epi16(_mm_set1_epi16((row + 1)), bottom_left_8x16b);
        row_term_8x16b = _mm_add_epi16(row_term_8x16b, round_8x16b);

        /* restart the column weights at the beginning of every row */
        col_wt_8x16b = col_wt_init_8x16b;
        rev_col_wt_8x16b = rev_col_wt_init_8x16b;

        for(col = 0; col < two_nt; col += 8)
        {
            __m128i top_8x16b, sum_8x16b;

            /* 16 bytes are loaded, only the low 8 are widened and used */
            top_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));
            top_8x16b = _mm_unpacklo_epi8(top_8x16b, zero_8x16b);

            /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
            sum_8x16b = _mm_add_epi16(row_term_8x16b,
                                      _mm_mullo_epi16(top_8x16b, top_wt_8x16b));

            /* (col + 1) * pu1_ref[three_nt + 1] */
            sum_8x16b = _mm_add_epi16(sum_8x16b,
                                      _mm_mullo_epi16(top_right_8x16b, col_wt_8x16b));

            /* (nt - 1 - col) * pu1_ref[two_nt - 1 - row] */
            sum_8x16b = _mm_add_epi16(sum_8x16b,
                                      _mm_mullo_epi16(left_8x16b, rev_col_wt_8x16b));

            sum_8x16b = _mm_srli_epi16(sum_8x16b, shift);
            sum_8x16b = _mm_packus_epi16(sum_8x16b, zero_8x16b);

            _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), sum_8x16b);

            /* advance both column weights by four pairs */
            rev_col_wt_8x16b = _mm_sub_epi16(rev_col_wt_8x16b, wt_step_8x16b);
            col_wt_8x16b = _mm_add_epi16(col_wt_8x16b, wt_step_8x16b);
        }
    }
}


/**
*******************************************************************************
*
* @brief
*  DC intra prediction for an interleaved chroma block. Reference samples are
* read from 'pu1_ref' and the predicted block is written to 'pu1_dst'. Refer
* to section 8.4.4.2.5 in the standard
*
* @par Description:
*  For nt of 4, 8 or 16 the 4 * nt bytes starting at pu1_ref[2 * nt] are
* summed separately for even (U) and odd (V) byte positions. The pair at
* pu1_ref[6 * nt] is then added and the pair at pu1_ref[4 * nt] subtracted.
* Each sum is rounded with '+ nt' and shifted right by log2(nt) + 1, and the
* resulting U/V pair is replicated over the whole block. For any other nt
* the vector summation is skipped and the wide (32 bytes per row) fill is
* used.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the interleaved reference samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size (Chroma)
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_dc_ssse3(UWORD8 *pu1_ref,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_dst,
                                      WORD32 dst_strd,
                                      WORD32 nt,
                                      WORD32 mode)
{
    WORD32 sum_u = 0;
    WORD32 sum_v = 0;
    WORD32 dc_u, dc_v, dc_uv;
    WORD32 row, idx;
    WORD32 shift;
    WORD32 num_vec;
    __m128i dc_16x8b, shuffle_mask, acc_8x16b, sign_8x16b;
    __m128i zero = _mm_set1_epi32(0);

    UNUSED(src_strd);
    UNUSED(mode);

    /* shift = log2(nt) + 1, num_vec = number of 16-byte loads that cover   */
    /* the 4 * nt reference bytes; unsupported sizes sum nothing            */
    if(nt == 16)
    {
        shift = 5;
        num_vec = 4;
    }
    else if(nt == 8)
    {
        shift = 4;
        num_vec = 2;
    }
    else if(nt == 4)
    {
        shift = 3;
        num_vec = 1;
    }
    else
    {
        shift = 6;
        num_vec = 0;
    }

    /* NOTE(review): the mask presumably gathers the U lanes and the V lanes */
    /* into separate halves before the horizontal adds - confirm against the */
    /* table definition                                                      */
    shuffle_mask = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);

    if(num_vec > 0)
    {
        /* widen every byte to 16 bits and accumulate lane-wise */
        acc_8x16b = zero;
        for(idx = 0; idx < num_vec; idx++)
        {
            __m128i ref_16x8b = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16 * idx));

            acc_8x16b = _mm_add_epi16(acc_8x16b, _mm_unpacklo_epi8(ref_16x8b, zero));
            acc_8x16b = _mm_add_epi16(acc_8x16b, _mm_unpackhi_epi8(ref_16x8b, zero));
        }

        /* reduce to one 16-bit total per component */
        acc_8x16b = _mm_shuffle_epi8(acc_8x16b, shuffle_mask);
        acc_8x16b = _mm_hadd_epi16(acc_8x16b, zero);
        acc_8x16b = _mm_hadd_epi16(acc_8x16b, zero);

        /* sign-extend the two totals to 32 bits */
        sign_8x16b = _mm_cmpgt_epi16(zero, acc_8x16b);
        acc_8x16b = _mm_unpacklo_epi16(acc_8x16b, sign_8x16b);

        sum_u = _mm_cvtsi128_si32(acc_8x16b);
        sum_v = _mm_cvtsi128_si32(_mm_srli_si128(acc_8x16b, 4)); /* next 32 bits */
    }

    /* include the pair just past the summed range */
    sum_u += pu1_ref[6 * nt];
    sum_v += pu1_ref[6 * nt + 1];

    /* exclude the pair at 4 * nt that the vector sum covered */
    sum_u -= pu1_ref[4 * nt];
    sum_v -= pu1_ref[4 * nt + 1];

    dc_u = (sum_u + nt) >> shift;
    dc_v = (sum_v + nt) >> shift;

    /* U in the low byte, V in the high byte of every 16-bit lane */
    dc_uv = dc_u | (dc_v << 8);
    dc_16x8b = _mm_set1_epi16(dc_uv);

    /* Fill the block with the DC pair */
    if(nt == 4)
    {
        /* 8 bytes per row */
        for(row = 0; row < 4; row++)
        {
            _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd)), dc_16x8b);
        }
    }
    else if(nt == 8)
    {
        /* 16 bytes per row */
        for(row = 0; row < 8; row++)
        {
            _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), dc_16x8b);
        }
    }
    else /* nt == 16 */
    {
        /* 32 bytes per row, rows handled in bands of eight */
        for(row = 0; row < nt; row += 8)
        {
            for(idx = 0; idx < 8; idx++)
            {
                UWORD8 *pu1_row = pu1_dst + ((row + idx) * dst_strd);

                _mm_storeu_si128((__m128i *)(pu1_row), dc_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_row + 16), dc_16x8b);
            }
        }
    }
}


/**
*******************************************************************************
*
* @brief
*  Horizontal intra prediction (mode 10) for an interleaved chroma block.
* Reference samples are read from 'pu1_ref' and the predicted block is
* written to 'pu1_dst'. Refer to section 8.4.4.2.6 in the standard
* (Special case)
*
* @par Description:
*  Output row 'row' is filled with the byte pair
* pu1_ref[4 * nt - 2 - 2 * row], pu1_ref[4 * nt - 1 - 2 * row] repeated along
* the row. nt of 8 writes 8 rows of 16 bytes, nt of 16 writes 16 rows of
* 32 bytes and every other nt writes 4 rows of 8 bytes.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the interleaved reference samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_horz_ssse3(UWORD8 *pu1_ref,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_dst,
                                        WORD32 dst_strd,
                                        WORD32 nt,
                                        WORD32 mode)
{
    WORD32 row, idx;
    WORD32 num_rows, row_bytes;
    __m128i uv_row[4];

    UNUSED(src_strd);
    UNUSED(mode);

    /* pick the geometry of the block once */
    switch(nt)
    {
        case 8:
            num_rows = 8;
            row_bytes = 16;
            break;
        case 16:
            num_rows = 16;
            row_bytes = 32;
            break;
        default:
            num_rows = 4;
            row_bytes = 8;
            break;
    }

    /* Rows are processed four at a time: all four pairs are fetched first, */
    /* then the four rows are written                                       */
    for(row = 0; row < num_rows; row += 4)
    {
        for(idx = 0; idx < 4; idx++)
        {
            WORD32 pos = (4 * nt) - 2 - 2 * (row + idx);
            __m128i even_16x8b = _mm_set1_epi8(pu1_ref[pos]);
            __m128i odd_16x8b = _mm_set1_epi8(pu1_ref[pos + 1]);

            /* interleave back to even, odd, even, odd ... */
            uv_row[idx] = _mm_unpacklo_epi8(even_16x8b, odd_16x8b);
        }

        for(idx = 0; idx < 4; idx++)
        {
            UWORD8 *pu1_out = pu1_dst + ((row + idx) * dst_strd);

            if(row_bytes == 8)
            {
                _mm_storel_epi64((__m128i *)(pu1_out), uv_row[idx]);
            }
            else
            {
                _mm_storeu_si128((__m128i *)(pu1_out), uv_row[idx]);
                if(row_bytes == 32)
                {
                    _mm_storeu_si128((__m128i *)(pu1_out + 16), uv_row[idx]);
                }
            }
        }
    }
}


/**
*******************************************************************************
*
* @brief
*  Vertical intra prediction (mode 26) for an interleaved chroma block.
* Reference samples are read from 'pu1_ref' and the predicted block is
* written to 'pu1_dst'. Refer to section 8.4.4.2.6 in the standard
* (Special case)
*
* @par Description:
*  Every output row is a copy of the reference bytes starting at
* pu1_ref[4 * nt + 2]. nt of 8 writes 8 rows of 16 bytes, nt of 16 writes
* 16 rows of 32 bytes and every other nt writes 4 rows of 8 bytes. Exactly
* one of these three paths is executed per call.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the interleaved reference samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_ver_ssse3(UWORD8 *pu1_ref,
                                       WORD32 src_strd,
                                       UWORD8 *pu1_dst,
                                       WORD32 dst_strd,
                                       WORD32 nt,
                                       WORD32 mode)
{
    WORD32 row;
    __m128i top_lo_16x8b;
    UNUSED(src_strd);
    UNUSED(mode);

    /* First 16 reference bytes of the row to replicate; every size needs */
    /* them (the 4-row path stores only the low 8 bytes)                  */
    top_lo_16x8b = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));

    /* Replication to all rows */
    if(nt == 8)
    {
        for(row = 0; row < 8; row++)
        {
            _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), top_lo_16x8b);
        }
    }
    else if(nt == 16)
    {
        /* 'else if' keeps the nt == 8 case from also running the 4-row */
        /* path below, which only repeated stores already done          */
        __m128i top_hi_16x8b;

        top_hi_16x8b = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 16));

        for(row = 0; row < 16; row++)
        {
            _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), top_lo_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (row * dst_strd)), top_hi_16x8b);
        }
    }
    else
    {
        for(row = 0; row < 4; row++)
        {
            _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd)), top_lo_16x8b);
        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Intra prediction for mode 2 (sw angle) for an interleaved chroma block.
* Reference samples are read from 'pu1_ref' and the predicted block is
* written to 'pu1_dst'. Refer to section 8.4.4.2.6 in the standard
*
* @par Description:
*  For each output row a 16-byte window of the reference that ends just
* before byte 4 * nt - 2 * row - 2 (shifted further back by 'col' for wider
* blocks) is loaded, permuted with a byte-shuffle table and stored. nt of 4
* uses IHEVCE_SHUFFLEMASKY7 and stores 8 bytes per row; all other sizes use
* IHEVCE_SHUFFLEMASKY8 and store 16 bytes per tile, in tiles of 8 rows.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the interleaved reference samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_mode2_ssse3(UWORD8 *pu1_ref,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 mode)
{
    WORD32 row, col, idx;
    __m128i ref_16x8b[8];
    __m128i mask_nt4, mask_wide;

    UNUSED(src_strd);
    UNUSED(mode);

    /* NOTE(review): both tables presumably reverse the order of the byte  */
    /* pairs (keeping each pair intact) - confirm against their definition */
    mask_nt4 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
    mask_wide = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY8[0]);

    /* For the angle 45, replication is done from the corresponding angle */
    /* intra_pred_ang = tan(angle) in q5 format */

    if(nt == 4)
    {
        /*pu1_ref[two_nt - row - (col+1) - 1]*/
        for(idx = 0; idx < 4; idx++)
        {
            ref_16x8b[idx] = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * idx - 8 - 2));
        }

        for(idx = 0; idx < 4; idx++)
        {
            _mm_storel_epi64((__m128i *)(pu1_dst + (idx * dst_strd)),
                             _mm_shuffle_epi8(ref_16x8b[idx], mask_nt4));
        }

        return;
    }

    /* nt == 8 is the single-tile case of this loop (row = 0, col = 0) */
    for(row = 0; row < nt; row += 8)
    {
        for(col = 0; col < 2 * nt; col += 16)
        {
            /*pu1_ref[two_nt - row - (col+1) - 1]*/
            /* fetch all eight rows of the tile before writing any of them */
            for(idx = 0; idx < 8; idx++)
            {
                ref_16x8b[idx] = _mm_loadu_si128(
                                (__m128i *)(pu1_ref + (4 * nt) - 2 * (row + idx) - (col + 16) - 2));
            }

            for(idx = 0; idx < 8; idx++)
            {
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + idx) * dst_strd)),
                                 _mm_shuffle_epi8(ref_16x8b[idx], mask_wide));
            }
        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Intraprediction for mode 34 (ne angle) and  mode 18 (nw angle) with
* reference  neighboring samples location pointed by 'pu1_ref' to the  TU
* block location pointed by 'pu1_dst'
*
* @par Description:
*
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[in] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_mode_18_34_ssse3(UWORD8 *pu1_ref,
                                              WORD32 src_strd,
                                              UWORD8 *pu1_dst,
                                              WORD32 dst_strd,
                                              WORD32 nt,
                                              WORD32 mode)
{
    WORD32 row;
    WORD32 idx = 0;

    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
    UNUSED(src_strd);

    if(mode == 34)
    {
        if(nt == 4)
        {
            /*pu1_ref[two_nt + col + idx + 1]*/
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));

            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);

        }
        else if(nt == 8)
        {
            /*pu1_ref[two_nt + col + idx + 1]*/
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + (4 * nt) + 2 * idx + 2));

            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);


        }
        else
        {
            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
            for(row = 0; row < nt; row += 8)
            {
                /*pu1_ref[two_nt + col + idx + 1]*/
                src_temp1  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp9  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2));
                src_temp2  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2));
                src_temp3  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2));
                src_temp4  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (0 * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (1 * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (2 * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (3 * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);

                src_temp5  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2));
                src_temp6  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2));
                src_temp7  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2));
                src_temp8  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (4 * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (5 * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (6 * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (7 * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);

                pu1_ref += 2 * 8;
                pu1_dst += 8 * dst_strd;
            }
        }
    }
    else
    {
        if(nt == 4)
        {
            /*pu1_ref[two_nt + col + idx + 1]*/
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));

            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);


        }
        else if(nt == 8)
        {
            /*pu1_ref[two_nt + col + idx + 1]*/
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + (4 * nt) + 2 * idx + 2));
            src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + (4 * nt) + 2 * idx + 2));

            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);


        }
        else
        {
            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
            for(row = 0; row < nt; row += 8)
            {
                /*pu1_ref[two_nt + col + idx + 1]*/
                src_temp1  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp9  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2));
                src_temp2  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2));
                src_temp3  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2));
                src_temp4  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (0 * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (1 * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (2 * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (3 * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);

                src_temp5  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2));
                src_temp6  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2));
                src_temp7  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2));
                src_temp8  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) +  0 + (4 * nt) + 2 * idx + 2));
                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (4 * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (5 * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (6 * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (7 * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);

                pu1_ref -= 2 * 8;
                pu1_dst += 8 * dst_strd;
            }
        }
    }

}

/**
*******************************************************************************
*
* @brief
*  Chroma intra prediction for mode 3 to 9 (positive angle, horizontal mode)
* with reference neighboring samples location pointed by 'pu1_ref' to the TU
* block location pointed by 'pu1_dst'
*
* @par Description:
*  The angle is read from gai4_ihevc_ang_table[mode]; the mode value itself
*  is not range checked here. For column number c (counted from 1):
*      pos     = c * angle
*      fract   = pos & 31
*      ref_off = (4 * nt) - 2 - 2 * (pos >> 5)
*  Sixteen bytes b[0..15] are loaded from pu1_ref + ref_off - 2 * row - 8 and
*  eight results are formed as
*      out[j] = ((32 - fract) * b[j + 2] + fract * b[j] + 16) >> 5,  j = 0..7
*  The results are packed to bytes, reordered with the IHEVCE_SHUFFLEMASKY7
*  byte shuffle and transposed so that the 16 bit pairs of one column end up
*  in four consecutive destination rows.
*
*  nt == 4 : 4 columns, 4 rows, 8 bytes written per row.
*  other nt: columns are handled 8 at a time (16 bytes per row) while the
*            byte offset 'col' is below 2 * nt, rows 4 at a time while 'row'
*            is below nt.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*  (presumably interleaved Cb/Cr neighbours - confirm against the caller)
*
* @param[in] src_strd
*  integer source stride (not used by this function)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination block
*
* @param[in] dst_strd
*  integer destination stride, in bytes between two consecutive rows
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode, used as index into gai4_ihevc_ang_table
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_dst,
                                               WORD32 dst_strd,
                                               WORD32 nt,
                                               WORD32 mode)
{
    WORD32 row, col, i;

    /* Intra Pred Angle according to the mode */
    WORD32 intra_pred_ang = gai4_ihevc_ang_table[mode];

    /* Per column scratch data; only entries 0..3 are used when nt == 4 */
    WORD16 ai2_ref_off[8];
    WORD8  ai1_fract[16], ai1_inv_fract[16];
    __m128i wgt_16x8b[8], pel_16x8b[8];

    /* Byte shuffle applied to every packed column result; the table is */
    /* defined elsewhere (presumably it reverses the order of the byte  */
    /* pairs - confirm against the table definition)                    */
    __m128i reorder_16x8b = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);

    __m128i round_8x16b = _mm_set1_epi16(16);
    __m128i c31_8x16b   = _mm_set1_epi16(31);
    __m128i c32_8x16b   = _mm_set1_epi16(32);
    __m128i zero_8x16b  = _mm_set1_epi16(0);

    __m128i ang_8x16b      = _mm_set1_epi16(intra_pred_ang);
    __m128i col_num_8x16b  = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
    __m128i base_off_8x16b = _mm_set1_epi16((4 * nt) - 2);

    __m128i pos_8x16b, fract_8x16b, inv_fract_8x16b, ref_off_8x16b;
    UNUSED(src_strd);

    if(nt == 4)
    {
        __m128i pair01_16x8b, pair23_16x8b, rows01_16x8b, rows23_16x8b;

        /* pos = column number * intra_pred_ang */
        pos_8x16b = _mm_mullo_epi16(col_num_8x16b, ang_8x16b);

        /* fract = pos & 31 and its complement (32 - fract) */
        fract_8x16b = _mm_and_si128(pos_8x16b, c31_8x16b);
        inv_fract_8x16b = _mm_sub_epi16(c32_8x16b, fract_8x16b);

        /* ref_off = (4 * nt) - 2 - 2 * (pos >> 5) */
        ref_off_8x16b = _mm_srai_epi16(pos_8x16b, 5);
        ref_off_8x16b = _mm_add_epi16(ref_off_8x16b, ref_off_8x16b);
        ref_off_8x16b = _mm_sub_epi16(base_off_8x16b, ref_off_8x16b);

        /* Spill the first four 16 bit lanes; the low byte of lane k is */
        /* read back from byte index 2 * k                              */
        _mm_storel_epi64((__m128i *)(ai1_fract), fract_8x16b);
        _mm_storel_epi64((__m128i *)(ai1_inv_fract), inv_fract_8x16b);

        ai2_ref_off[0] = _mm_extract_epi16(ref_off_8x16b, 0);
        ai2_ref_off[1] = _mm_extract_epi16(ref_off_8x16b, 1);
        ai2_ref_off[2] = _mm_extract_epi16(ref_off_8x16b, 2);
        ai2_ref_off[3] = _mm_extract_epi16(ref_off_8x16b, 3);

        /* Weight bytes per column: (32 - fract), fract, (32 - fract), ... */
        for(i = 0; i < 4; i++)
        {
            wgt_16x8b[i] = _mm_unpacklo_epi8(_mm_set1_epi8(ai1_inv_fract[2 * i]),
                                             _mm_set1_epi8(ai1_fract[2 * i]));
        }

        for(i = 0; i < 4; i++)
        {
            /* b[0..15] and the same bytes moved down by two positions */
            __m128i near_16x8b = _mm_loadu_si128((__m128i *)(pu1_ref + ai2_ref_off[i] - 8));
            __m128i far_16x8b  = _mm_srli_si128(near_16x8b, 2);
            __m128i acc_8x16b;

            /* (32 - fract) * b[j + 2] + fract * b[j] */
            acc_8x16b = _mm_maddubs_epi16(_mm_unpacklo_epi8(far_16x8b, near_16x8b), wgt_16x8b[i]);

            /* (... + 16) >> 5 */
            acc_8x16b = _mm_add_epi16(acc_8x16b, round_8x16b);
            acc_8x16b = _mm_srai_epi16(acc_8x16b, 5);

            /* 16 bit to 8 bit, then reorder the bytes */
            acc_8x16b = _mm_packus_epi16(acc_8x16b, zero_8x16b);
            pel_16x8b[i] = _mm_shuffle_epi8(acc_8x16b, reorder_16x8b);
        }

        /* Transpose: gather the 16 bit pairs of the four columns row by row */
        pair01_16x8b = _mm_unpacklo_epi16(pel_16x8b[0], pel_16x8b[1]);
        pair23_16x8b = _mm_unpacklo_epi16(pel_16x8b[2], pel_16x8b[3]);

        rows01_16x8b = _mm_unpacklo_epi32(pair01_16x8b, pair23_16x8b);
        rows23_16x8b = _mm_unpackhi_epi32(pair01_16x8b, pair23_16x8b);

        /* Low half holds the even row, high half the following odd row */
        _mm_storel_epi64((__m128i *)(pu1_dst), rows01_16x8b);                                       /* row=0*/
        _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), _mm_srli_si128(rows01_16x8b, 8));         /* row=1*/
        _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 2)), rows23_16x8b);                      /* row=2*/
        _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 3)), _mm_srli_si128(rows23_16x8b, 8));   /* row=3*/
    }
    else
    {
        /* Eight columns are consumed per pass of the column loop */
        __m128i col_step_8x16b = _mm_set1_epi16(8);

        for(col = 0; col < 2 * nt; col += 16)
        {
            /* pos = column number * intra_pred_ang */
            pos_8x16b = _mm_mullo_epi16(col_num_8x16b, ang_8x16b);

            /* fract = pos & 31 and its complement (32 - fract) */
            fract_8x16b = _mm_and_si128(pos_8x16b, c31_8x16b);
            inv_fract_8x16b = _mm_sub_epi16(c32_8x16b, fract_8x16b);

            /* ref_off = (4 * nt) - 2 - 2 * (pos >> 5) */
            ref_off_8x16b = _mm_srai_epi16(pos_8x16b, 5);
            ref_off_8x16b = _mm_add_epi16(ref_off_8x16b, ref_off_8x16b);
            ref_off_8x16b = _mm_sub_epi16(base_off_8x16b, ref_off_8x16b);

            /* Column numbers for the next pass */
            col_num_8x16b = _mm_add_epi16(col_num_8x16b, col_step_8x16b);

            /* Spill all eight 16 bit lanes; the low byte of lane k is */
            /* read back from byte index 2 * k                         */
            _mm_storeu_si128((__m128i *)(ai1_fract), fract_8x16b);
            _mm_storeu_si128((__m128i *)(ai1_inv_fract), inv_fract_8x16b);

            ai2_ref_off[0] = _mm_extract_epi16(ref_off_8x16b, 0);
            ai2_ref_off[1] = _mm_extract_epi16(ref_off_8x16b, 1);
            ai2_ref_off[2] = _mm_extract_epi16(ref_off_8x16b, 2);
            ai2_ref_off[3] = _mm_extract_epi16(ref_off_8x16b, 3);
            ai2_ref_off[4] = _mm_extract_epi16(ref_off_8x16b, 4);
            ai2_ref_off[5] = _mm_extract_epi16(ref_off_8x16b, 5);
            ai2_ref_off[6] = _mm_extract_epi16(ref_off_8x16b, 6);
            ai2_ref_off[7] = _mm_extract_epi16(ref_off_8x16b, 7);

            /* Weight bytes per column: (32 - fract), fract, (32 - fract), ... */
            for(i = 0; i < 8; i++)
            {
                wgt_16x8b[i] = _mm_unpacklo_epi8(_mm_set1_epi8(ai1_inv_fract[2 * i]),
                                                 _mm_set1_epi8(ai1_fract[2 * i]));
            }

            for(row = 0; row < nt; row += 4)
            {
                __m128i lo_pair01_16x8b, lo_pair23_16x8b, hi_pair01_16x8b, hi_pair23_16x8b;
                __m128i lo_rows01_16x8b, lo_rows23_16x8b, hi_rows01_16x8b, hi_rows23_16x8b;
                UWORD8 *pu1_out = pu1_dst + col + (dst_strd * row);

                for(i = 0; i < 8; i++)
                {
                    /* b[0..15] and the same bytes moved down by two positions */
                    __m128i near_16x8b = _mm_loadu_si128((__m128i *)(pu1_ref + ai2_ref_off[i] - (2 * row) - 8));
                    __m128i far_16x8b  = _mm_srli_si128(near_16x8b, 2);
                    __m128i acc_8x16b;

                    /* (32 - fract) * b[j + 2] + fract * b[j] */
                    acc_8x16b = _mm_maddubs_epi16(_mm_unpacklo_epi8(far_16x8b, near_16x8b), wgt_16x8b[i]);

                    /* (... + 16) >> 5 */
                    acc_8x16b = _mm_add_epi16(acc_8x16b, round_8x16b);
                    acc_8x16b = _mm_srai_epi16(acc_8x16b, 5);

                    /* 16 bit to 8 bit, then reorder the bytes */
                    acc_8x16b = _mm_packus_epi16(acc_8x16b, zero_8x16b);
                    pel_16x8b[i] = _mm_shuffle_epi8(acc_8x16b, reorder_16x8b);
                }

                /* Transpose columns 0..3 */
                lo_pair01_16x8b = _mm_unpacklo_epi16(pel_16x8b[0], pel_16x8b[1]);
                lo_pair23_16x8b = _mm_unpacklo_epi16(pel_16x8b[2], pel_16x8b[3]);
                lo_rows01_16x8b = _mm_unpacklo_epi32(lo_pair01_16x8b, lo_pair23_16x8b);
                lo_rows23_16x8b = _mm_unpackhi_epi32(lo_pair01_16x8b, lo_pair23_16x8b);

                /* Transpose columns 4..7 */
                hi_pair01_16x8b = _mm_unpacklo_epi16(pel_16x8b[4], pel_16x8b[5]);
                hi_pair23_16x8b = _mm_unpacklo_epi16(pel_16x8b[6], pel_16x8b[7]);
                hi_rows01_16x8b = _mm_unpacklo_epi32(hi_pair01_16x8b, hi_pair23_16x8b);
                hi_rows23_16x8b = _mm_unpackhi_epi32(hi_pair01_16x8b, hi_pair23_16x8b);

                /* Join both halves into full 16 byte rows and write them out */
                _mm_storeu_si128((__m128i *)(pu1_out),
                                 _mm_unpacklo_epi64(lo_rows01_16x8b, hi_rows01_16x8b));   /* row + 0 */
                _mm_storeu_si128((__m128i *)(pu1_out + dst_strd),
                                 _mm_unpackhi_epi64(lo_rows01_16x8b, hi_rows01_16x8b));   /* row + 1 */
                _mm_storeu_si128((__m128i *)(pu1_out + (dst_strd * 2)),
                                 _mm_unpacklo_epi64(lo_rows23_16x8b, hi_rows23_16x8b));   /* row + 2 */
                _mm_storeu_si128((__m128i *)(pu1_out + (dst_strd * 3)),
                                 _mm_unpackhi_epi64(lo_rows23_16x8b, hi_rows23_16x8b));   /* row + 3 */
            }
        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Intraprediction for mode 11 to 17  (negative angle, horizontal mode )
* with reference  neighboring samples location pointed by 'pu1_ref' to the
* TU block location pointed by 'pu1_dst'
*
* @par Description:
*
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference neighboring samples
*
* @param[in] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_chroma_mode_11_to_17_ssse3(UWORD8 *pu1_ref,
                                                 WORD32 src_strd,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 dst_strd,
                                                 WORD32 nt,
                                                 WORD32 mode)
{
    /* This function and ihevc_intra_pred_CHROMA_mode_19_to_25 are same except*/
    /* for ref main & side samples assignment,can be combined for */
    /* optimzation*/

    WORD32 row, col, k;
    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
    WORD32 ref_idx;


    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
    __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b;

    UWORD8 ref_temp[2 * MAX_CU_SIZE + 2];
    UWORD8 *ref_main;
    UNUSED(src_strd);

    inv_ang_sum = 128;

    intra_pred_ang = gai4_ihevc_ang_table[mode];

    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
    /* Intermediate reference samples for negative angle modes */
    /* This have to be removed during optimization*/

    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */


    ref_main = ref_temp + 2 * nt;
    for(k = 0; k < (2 * (nt + 1)); k += 2)
    {
        ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k];
        ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k + 1];
    }

    ref_main = ref_temp + (2 * (nt - 1));
    ref_idx = (nt * intra_pred_ang) >> 5;

    /* SIMD Optimization can be done using look-up table for the loop */
    /* For negative angled derive the main reference samples from side */
    /* reference samples refer to section 8.4.4.2.6 */

    for(k = -2; k > (2 * ref_idx); k -= 2)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[(4 * nt) + ((inv_ang_sum >> 8) << 1)];
        ref_main[k + 1] = pu1_ref[((4 * nt) + 1) + ((inv_ang_sum >> 8) << 1)];
    }

    /* For the angles other then 45 degree, interpolation btw 2 neighboring */
    /* samples dependent on distance to obtain destination sample */

    const_temp_4x32b  = _mm_set1_epi16(16);
    const_temp2_4x32b = _mm_set1_epi32(31);
    const_temp3_4x32b = _mm_set1_epi16(32);
    const_temp4_4x32b = _mm_set1_epi32(4);

    two_nt_4x32b = _mm_set1_epi32(1);

    zero_8x16b = _mm_set1_epi16(0);


    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);

    row_4x32b = _mm_set_epi32(4, 3, 2, 1);

    if(nt == 4)
    {
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
        const_temp2_4x32b = _mm_set1_epi16(31);
        const_temp4_4x32b = _mm_set1_epi16(4);
        two_nt_4x32b = _mm_set1_epi16(1);

        {
            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
            WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];

            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
            __m128i src_values10;

            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;

            /* pos = ((row + 1) * intra_pred_ang); */
            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);

            /* fract = pos & (31); */
            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);

            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);

            _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
            _mm_storel_epi64((__m128i *)(ai1_src_temp_val),  src_values10);

            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/

            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/

            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);

            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/

            {
                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

                /* loding 8-bit 16 pixels */
                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/

                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/

                src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
                src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
                src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
                src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/

                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/

                /* converting 16 bit to 8 bit */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/

                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);

                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);

                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b);             /* row=0*/

                src_temp2_8x16b  = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b);       /* row=1*/

                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b);       /* row=2*/

                src_temp4_8x16b  = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b);       /* row=4*/

            }
        }
    }
    else
    {
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
        const_temp2_4x32b = _mm_set1_epi16(31);
        const_temp4_4x32b = _mm_set1_epi16(8);
        two_nt_4x32b = _mm_set1_epi16(1);

        for(col = 0; col < 2 * nt; col += 16)
        {
            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
            WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];

            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
            __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;

            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;

            /* pos = ((row + 1) * intra_pred_ang); */
            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);

            /* fract = pos & (31); */
            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);

            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);

            _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
            _mm_storeu_si128((__m128i *)(ai1_src_temp_val),  src_values10);

            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/

            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/

            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);

            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/

            fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]);  /* col=5*/
            fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]);  /* col=6*/
            fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]);  /* col=7*/
            fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]);  /* col=8*/

            temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]);  /* col=0*/
            temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]);  /* col=1*/
            temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]);  /* col=2*/
            temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]);  /* col=3*/

            temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
            temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
            temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
            temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);

            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/

            for(row = 0; row < nt; row += 4)
            {
                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;

                /* loding 8-bit 16 pixels */
                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row + row)); /* col=0*/
                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row + row)); /* col=1*/
                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row + row)); /* col=2*/
                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row + row)); /* col=3*/

                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/

                src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
                src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
                src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
                src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/

                /* loding 8-bit 16 pixels */
                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row + row)); /* col=5*/
                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row + row)); /* col=6*/
                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row + row)); /* col=7*/
                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row + row)); /* col=8*/

                src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=5*/
                src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=6*/
                src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=7*/
                src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=8*/

                src_temp11_8x16b =  _mm_unpacklo_epi8(src_temp15_8x16b, src_temp11_8x16b); /* col=0*/
                src_temp12_8x16b =  _mm_unpacklo_epi8(src_temp16_8x16b, src_temp12_8x16b); /* col=1*/
                src_temp13_8x16b =  _mm_unpacklo_epi8(src_temp17_8x16b, src_temp13_8x16b); /* col=2*/
                src_temp14_8x16b =  _mm_unpacklo_epi8(src_temp18_8x16b, src_temp14_8x16b); /* col=3*/

                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);

                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/

                /* converting 16 bit to 8 bit */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/

                /* converting 16 bit to 8 bit */
                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=5*/
                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=6*/
                src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=7*/
                src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=8*/

                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);

                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);

                src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
                src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);

                src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
                src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);

                src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
                src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
                src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
                src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);

                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)),    src_temp11_8x16b);          /* row=0*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b);       /* row=1*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b);       /* row=2*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b);       /* row=4*/

            }
        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Intra prediction for mode 19 to 25 (negative angle, vertical mode) with
* reference neighboring samples location pointed by 'pu1_ref' to the TU
* block location pointed by 'pu1_dst'
*
* @par Description:
*
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[in] src_strd
*  integer source stride (unused by this function)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_mode_19_to_25_ssse3(UWORD8 *pu1_ref,
                                                 WORD32 src_strd,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 dst_strd,
                                                 WORD32 nt,
                                                 WORD32 mode)
{
    WORD32 row, k;
    WORD32 intra_pred_ang, idx;
    WORD32 inv_ang, inv_ang_sum, pos, fract;
    WORD32 ref_main_idx, ref_idx;
    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 2];
    UWORD8 *ref_main;

    __m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
    UNUSED(src_strd);

    intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
    inv_ang = gai4_ihevc_inv_ang_table_chroma[mode - 12];

    /* Intermediate reference samples for negative angle modes */
    /* This have to be removed during optimization*/
    /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
    ref_main = ref_temp + 2 * nt;
    for(k = 0; k < (2 * (nt + 1)); k += 2)
    {
        ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k];
        ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k + 1];
    }

    ref_idx = (nt * intra_pred_ang) >> 5;
    inv_ang_sum = 128;
    ref_main = ref_temp + (2 * (nt - 1));
    /* SIMD Optimization can be done using look-up table for the loop */
    /* For negative angled derive the main reference samples from side */
    /*  reference samples refer to section 8.4.4.2.6 */
    for(k = -2; k > (2 * ref_idx); k -= 2)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[(4 * nt) - (inv_ang_sum >> 8) * 2];
        ref_main[k + 1] = pu1_ref[((4 * nt) + 1) - (inv_ang_sum >> 8) * 2];
    }

    const_temp_8x16b = _mm_set1_epi16(16);

    if(nt == 4) /* if nt =4*/
    {
        __m128i const_temp2_4x32b, const_temp3_4x32b;
        __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
        __m128i row_4x32b, two_nt_4x32b, src_values12;


        const_temp2_4x32b = _mm_set1_epi32(31);
        const_temp3_4x32b = _mm_set1_epi32(32);

        two_nt_4x32b = _mm_set1_epi32(2);

        zero_8x16b = _mm_set1_epi16(0);

        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);

        row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
        {
            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
            WORD8  ai1_src_temp0_val[16], ai1_src_temp1_val[16];

            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
            __m128i src_values0, src_values1, src_values2, src_values3, src_values13;
            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;

            /* pos = ((row + 1) * intra_pred_ang); */
            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
            sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
            res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);

            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
            src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b,  5));

            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
            ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
            ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
            ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
            ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/

            /* fract = pos & (31); */
            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);

            _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
            _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);

            fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]);  /* row=0*/
            fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]);  /* row=1*/
            fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]);  /* row=2*/
            fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]);  /* row=3*/

            temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]);  /* row=0*/
            temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]);  /* row=1*/
            temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]);  /* row=2*/
            temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]);  /* row=3*/

            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);

// inner loop starts from here
            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));  /* col = 0-7   */
            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2));  /* col = 8-15  */
            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3));  /* col = 16-23 */
            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4));  /* col = 24-31 */

            src_values10 = _mm_srli_si128(src_values0, 2);
            src_values11 = _mm_srli_si128(src_values1, 2);
            src_values12 = _mm_srli_si128(src_values2, 2);
            src_values13 = _mm_srli_si128(src_values3, 2);

            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);

            src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
            src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
            src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
            src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
            src_values0 = _mm_srai_epi16(src_values0,  5);
            src_values1 = _mm_srai_epi16(src_values1,  5);
            src_values2 = _mm_srai_epi16(src_values2,  5);
            src_values3 = _mm_srai_epi16(src_values3,  5);

            /* converting 16 bit to 8 bit */
            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);

            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0);       /* row=0*/
            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1);   /* row=1*/
            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2);   /* row=2*/
            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3);   /* row=3*/

        }
    }
    else if(nt == 8) /* for nt = 16 case */
    {
        WORD32 ref_main_idx1, fract1, temp, temp1;
        __m128i fract1_8x16b, temp_8x16b, temp1_8x16b;

        zero_8x16b = _mm_set1_epi16(0);

        for(row = 0; row < nt; row += 2)
        {
            __m128i src_values0, src_values1, src_values2, src_values3;
            __m128i  src_values10, src_values11, src_values12, src_values13;

            pos = ((row + 1) * intra_pred_ang);
            idx = pos >> 5;
            fract = pos & (31);
            temp = 32 - fract;
            ref_main_idx = 2 * idx + 2; /* col from 0-15 */

            pos = ((row + 2) * intra_pred_ang);
            idx = pos >> 5;
            fract1 = pos & (31);
            temp1 = 32 - fract1;
            ref_main_idx1 = 2 * idx + 2; /* col from 0-15 */

            fract_8x16b  = _mm_set1_epi8(fract);
            fract1_8x16b = _mm_set1_epi8(fract1);
            temp_8x16b   = _mm_set1_epi8(temp);
            temp1_8x16b  = _mm_set1_epi8(temp1);

            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);

            /* row=0 */
            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx));     /* col = 0-7   */
            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8));   /* col = 8-15  */

            /* row=1 */
            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));   /* col = 0-7  */
            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1 + 8));  /* col = 8-15 */

            src_values10 = _mm_srli_si128(src_values0, 2);
            src_values11 = _mm_srli_si128(src_values1, 2);
            src_values12 = _mm_srli_si128(src_values2, 2);
            src_values13 = _mm_srli_si128(src_values3, 2);

            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);

            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);

            src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
            src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);

            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
            src_values0 = _mm_srai_epi16(src_values0,  5);
            src_values1 = _mm_srai_epi16(src_values1,  5);

            src_values2 = _mm_srai_epi16(src_values2,  5);
            src_values3 = _mm_srai_epi16(src_values3,  5);

            /* converting 16 bit to 8 bit */
            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);

            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);

            /* loding 8-bit 8 pixels values */
            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);

            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);

            pu1_dst += 2 * dst_strd;
        }
    }
    else if(nt == 16)
    {
        WORD32 temp;
        /* unroll the col loop (inner) */
        zero_8x16b = _mm_set1_epi16(0);

        for(row = 0; row < nt; row += 1)
        {
            __m128i  src_values0, src_values1, src_values2, src_values3, temp_8x16b;
            __m128i  src_values10, src_values11, src_values12, src_values13;

            pos = ((row + 1) * intra_pred_ang);
            idx = pos >> 5;
            fract = pos & (31);
            temp = 32 - fract;
            ref_main_idx = 2 * idx + 2; /* col from 0-31 */

            fract_8x16b = _mm_set1_epi8(fract);
            temp_8x16b  = _mm_set1_epi8(temp);

            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);

            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx));     /* col = 0-7   */
            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8));   /* col = 8-15  */
            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 16));  /* col = 16-23 */
            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 24));  /* col = 24-31 */

            src_values10 = _mm_srli_si128(src_values0, 2);
            src_values11 = _mm_srli_si128(src_values1, 2);
            src_values12 = _mm_srli_si128(src_values2, 2);
            src_values13 = _mm_srli_si128(src_values3, 2);

            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);

            /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
            src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
            src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
            src_values0 = _mm_srai_epi16(src_values0,  5);
            src_values1 = _mm_srai_epi16(src_values1,  5);
            src_values2 = _mm_srai_epi16(src_values2,  5);
            src_values3 = _mm_srai_epi16(src_values3,  5);

            /* converting 16 bit to 8 bit */
            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);

            /* loding 8-bit 8 pixels values */
            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
            _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
            _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);

            pu1_dst += dst_strd;

        }
    }
}


/**
*******************************************************************************
*
* @brief
*  Two-tap weighted interpolation of 8 consecutive reference bytes
*
* @par Description:
*  For i = 0..7 computes
*      (w0 * pu1_src[i] + w1 * pu1_src[i + 2] + round) >> 5
*  where (w0, w1) is the signed byte pair replicated in wgt_8x16b. The second
*  tap is two bytes ahead of the first (presumably because Cb and Cr samples
*  are interleaved in the reference - confirm against the caller). Exactly the
*  bytes pu1_src[0] .. pu1_src[9] are read.
*
* @param[in] pu1_src
*  UWORD8 pointer to the first reference byte of the group
*
* @param[in] wgt_8x16b
*  Weight pair (32 - fract, fract) replicated in every 16-bit lane, low byte
*  first
*
* @param[in] round_8x16b
*  Rounding offset added to every 16-bit lane before the shift
*
* @returns
*  The 8 results saturated to unsigned bytes in the low 64 bits of the
*  register; the upper 64 bits are zero
*
* @remarks
*  None
*
*******************************************************************************
*/

static __m128i ihevc_chroma_mode_27_to_33_interp8_ssse3(const UWORD8 *pu1_src,
                                                        __m128i wgt_8x16b,
                                                        __m128i round_8x16b)
{
    __m128i cur_8x16b, nxt_8x16b, res_8x16b;

    /* 8-byte loads: only pu1_src[0..9] is touched, nothing beyond what is used */
    cur_8x16b = _mm_loadl_epi64((const __m128i *)(pu1_src));
    nxt_8x16b = _mm_loadl_epi64((const __m128i *)(pu1_src + 2));

    /* pair every sample with the sample two bytes ahead of it */
    res_8x16b = _mm_unpacklo_epi8(cur_8x16b, nxt_8x16b);

    /* (32 - fract) * pu1_src[i] + fract * pu1_src[i + 2] */
    res_8x16b = _mm_maddubs_epi16(res_8x16b, wgt_8x16b);

    /* (... + 16) >> 5 */
    res_8x16b = _mm_add_epi16(res_8x16b, round_8x16b);
    res_8x16b = _mm_srai_epi16(res_8x16b, 5);

    /* converting 16 bit to 8 bit */
    return _mm_packus_epi16(res_8x16b, _mm_setzero_si128());
}


/**
*******************************************************************************
*
* @brief
*  Intraprediction for mode 27 to 33  (positive angle, vertical mode ) with
* reference  neighboring samples location pointed by 'pu1_ref' to the  TU
* block location pointed by 'pu1_dst'
*
* @par Description:
*  For every row the displacement pos = (row + 1) * intra_pred_ang is split
*  into idx = pos >> 5 and fract = pos & 31, where intra_pred_ang is
*  gai4_ihevc_ang_table_chroma[mode]. Each of the 2 * nt output bytes of the
*  row is then
*      ((32 - fract) * ref[i] + fract * ref[i + 2] + 16) >> 5
*  with ref = pu1_ref + (4 * nt) + 2 + (2 * idx).
*  Only nt equal to 4, 8 or 16 is handled; for any other value nothing is
*  written to pu1_dst.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[in] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode, used as index into
*  gai4_ihevc_ang_table_chroma
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_mode_27_to_33_ssse3(UWORD8 *pu1_ref,
                                                 WORD32 src_strd,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 dst_strd,
                                                 WORD32 nt,
                                                 WORD32 mode)
{
    WORD32 row, col;
    WORD32 pos, idx, fract;
    WORD32 intra_pred_ang;
    WORD32 row_bytes;
    const UWORD8 *pu1_ref_main;

    __m128i const_temp_8x16b;
    UNUSED(src_strd);

    intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];

    /* unsupported block sizes are left untouched */
    if((nt != 4) && (nt != 8) && (nt != 16))
    {
        return;
    }

    /* rounding offset for the >> 5 normalisation */
    const_temp_8x16b = _mm_set1_epi16(16);

    /* every output row is 2 * nt bytes wide, produced in groups of 8 bytes */
    row_bytes = 2 * nt;

    /* position in the reference that corresponds to idx = 0 */
    pu1_ref_main = pu1_ref + (4 * nt) + 2;

    for(row = 0; row < nt; row++)
    {
        const UWORD8 *pu1_src;
        __m128i fract_8x16b, wgt_8x16b;

        pos = ((row + 1) * intra_pred_ang);
        idx = pos >> 5;
        fract = pos & (31);

        /* byte pairs (32 - fract, fract) consumed by the multiply-add */
        fract_8x16b = _mm_set1_epi8((WORD8)fract);
        wgt_8x16b   = _mm_set1_epi8((WORD8)(32 - fract));
        wgt_8x16b   = _mm_unpacklo_epi8(wgt_8x16b, fract_8x16b);

        /* idx is scaled by 2 to get a byte offset into the reference */
        pu1_src = pu1_ref_main + (2 * idx);

        for(col = 0; col < row_bytes; col += 8)
        {
            __m128i res_8x16b;

            res_8x16b = ihevc_chroma_mode_27_to_33_interp8_ssse3(pu1_src + col,
                                                                 wgt_8x16b,
                                                                 const_temp_8x16b);

            /* storing 8 predicted bytes */
            _mm_storel_epi64((__m128i *)(pu1_dst + col), res_8x16b);
        }

        pu1_dst += dst_strd;
    }
}