/* ih264_deblk_luma_ssse3.c - Android社区 - https://www.androidos.net.cn/ */

/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/*                                                                           */
/*  File Name         : ih264_deblk_luma_ssse3.c                             */
/*                                                                           */
/*  Description       : Contains function definitions for deblocking         */
/*                                                                           */
/*  List of Functions : ih264_deblk_luma_vert_bs4_ssse3()                    */
/*                      ih264_deblk_luma_horz_bs4_ssse3()                    */
/*                      ih264_deblk_luma_vert_bslt4_ssse3()                  */
/*                      ih264_deblk_luma_horz_bslt4_ssse3()                  */
/*                      ih264_deblk_luma_vert_bs4_mbaff_ssse3()              */
/*                      ih264_deblk_luma_vert_bslt4_mbaff_ssse3()            */
/*                                                                           */
/*  Issues / Problems : None                                                 */
/*                                                                           */
/*  Revision History  :                                                      */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Added luma deblocking ssse3          */
/*                                      intrinsics                           */
/*                                                                           */
/*****************************************************************************/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_platform_macros.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264_macros.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_luma_vert_bs4_ssse3()                        */
/*                                                                           */
/*  Description   : This function performs filtering of a luma block         */
/*                  vertical edge when the boundary strength is set to 4.    */
/*                                                                           */
/*  Inputs        : pu1_src    - pointer to the src sample q0                */
/*                  src_strd   - source stride                               */
/*                  alpha      - alpha value for the boundary                */
/*                  beta       - beta value for the boundary                 */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4" in */
/*                  ITU T Rec H.264.                                         */
/*                                                                           */
/*  Outputs       : Filtered pixels are written back in place                */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta)
{
    /*
     * In-place bS == 4 filtering across a vertical luma edge, 16 rows per call.
     *
     * For every row r in 0..15 the 8 bytes at (pu1_src - 4 + r * src_strd)
     * are read as  p3 p2 p1 p0 | q0 q1 q2 q3,  filtered, and stored back to
     * the same address.
     *
     * Steps:
     *   1. Load 16 rows x 8 bytes and transpose them so that each register
     *      holds one pixel position (p3 .. q3) for all 16 rows.
     *   2. Build per-row byte masks from the alpha / beta threshold tests.
     *   3. Compute the candidate filtered values in 16-bit precision, 8 rows
     *      at a time, and pack them back to bytes.
     *   4. Blend filtered / unfiltered values using the masks.
     *   5. Transpose back and store, 8 rows at a time.
     */
    __m128i zero = _mm_setzero_si128();
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    __m128i q0_16x8_1;
    __m128i p0_16x8_1;
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    __m128i Alpha_8x16, Beta_8x16;
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
    __m128i const_val2_16x8 = _mm_set1_epi16(2);
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* ---- Step 1a: load rows 0..7 and transpose (8x8 bytes) ---- */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));

    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    /*
     * The *_8x16 registers are used as scratch here: each one holds two pixel
     * columns (8 rows each) of the half-transposed rows 0..7.
     */
    p1_8x16 = _mm_unpacklo_epi32(line1, line3);
    p0_8x16 = _mm_unpackhi_epi32(line1, line3);
    q0_8x16 = _mm_unpacklo_epi32(line2, line4);
    q1_8x16 = _mm_unpackhi_epi32(line2, line4);

    /* ---- Step 1b: load rows 8..15 and transpose ---- */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd));

    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    temp1 = _mm_unpacklo_epi32(line1, line3);
    temp2 = _mm_unpackhi_epi32(line1, line3);
    temp3 = _mm_unpacklo_epi32(line2, line4);
    temp4 = _mm_unpackhi_epi32(line2, line4);

    /* Merge both halves: one register per pixel position, 16 rows each. */
    p3_16x8 = _mm_unpacklo_epi64(p1_8x16, temp1);
    p2_16x8 = _mm_unpackhi_epi64(p1_8x16, temp1);
    q2_16x8 = _mm_unpacklo_epi64(q1_8x16, temp4);
    q3_16x8 = _mm_unpackhi_epi64(q1_8x16, temp4);
    p1_16x8 = _mm_unpacklo_epi64(p0_8x16, temp2);
    p0_16x8 = _mm_unpackhi_epi64(p0_8x16, temp2);
    q0_16x8 = _mm_unpacklo_epi64(q0_8x16, temp3);
    q1_16x8 = _mm_unpackhi_epi64(q0_8x16, temp3);

    /* ---- Step 2: threshold masks ---- */
    /*
     * |a - b| on unsigned bytes is built from two saturating subtractions
     * (one of them is always zero). The result is widened to 16 bits before
     * the signed compare against alpha / beta, then the 16-bit masks
     * (0 / -1) are packed back into byte masks (0x00 / 0xFF).
     */

    /* Cond1 (ABS(p0 - q0) < alpha) */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag1_16x8 = _mm_packs_epi16(temp2, temp1);

    /* Cond2 (ABS(q1 - q0) < beta) */
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Cond3 (ABS(p1 - p0) < beta) */
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    /*
     * flag1 = (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) &&
     *         (ABS(p1 - p0) < beta)
     * Rows where flag1 is clear are left untouched.
     */
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* flag2 = flag1 && (ABS(p0 - q0) < ((alpha >> 2) + 2)) */
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* flag3 = flag2 && (ABS(p2 - p0) < beta): strong filter on the p side */
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);

    /* flag4 = flag2 && (ABS(q2 - q0) < beta): strong filter on the q side */
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);

    /* ---- Step 3a: candidate values for rows 0..7 (16-bit lanes) ---- */
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);

    /*
     * p0_1 = (2*p1 + p0 + q1 + 2) >> 2
     * q0_1 = (2*q1 + q0 + p1 + 2) >> 2
     * temp5 (= p0 + q1 + 2) and temp6 (= p1 + q0 + 2) are reused below.
     */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);

    /*
     * p1_2 = (p2 + p1 + p0 + q0 + 2) >> 2
     * q1_2 = (q2 + q1 + q0 + p0 + 2) >> 2
     */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);

    /*
     * p0_2 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
     * q0_2 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3
     */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);

    /*
     * p2_2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
     * q2_2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
     */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);

    /* ---- Step 3b: rows 8..15, packed together with rows 0..7 ---- */
    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);

    /* p0_1 and q0_1 */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);

    /* p1_2 and q1_2 */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);

    /* p0_2 and q0_2 */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);

    /* p2_2 and q2_2 */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);

    /* ---- Step 4: blend ---- */
    /*
     * Each blend is (old & ~mask) + (new & mask); the two operands never have
     * overlapping bits, so the byte add acts as an OR.
     */

    /* p0 / q0: weak filter wherever flag1 is set */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);

    /* p0 / q0: strong filter overrides the weak one under flag3 / flag4 */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);

    /* p1 and q1 */
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);

    /* p2 and q2 */
    p2_16x8 = _mm_and_si128(p2_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
    q2_16x8 = _mm_and_si128(q2_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);

    /* ---- Step 5a: transpose back and store rows 0..7 ---- */
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
    temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);

    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);

    /* Each odd line holds two rows; the even line is its upper 8 bytes. */
    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
    line8 = _mm_srli_si128(line7, 8);

    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);

    /* ---- Step 5b: transpose back and store rows 8..15 ---- */
    temp1 = _mm_unpackhi_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpackhi_epi8(p1_16x8, p0_16x8);
    temp3 = _mm_unpackhi_epi8(q0_16x8, q1_16x8);
    temp4 = _mm_unpackhi_epi8(q2_16x8, q3_16x8);

    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);

    /*
     * line1..line8 must be rebuilt from the rows 8..15 transpose; storing the
     * values left over from step 5a would overwrite rows 8..15 with copies of
     * rows 0..7.
     */
    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
    line8 = _mm_srli_si128(line7, 8);

    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd), line8);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_luma_horz_bs4_ssse3()                        */
/*                                                                           */
/*  Description   : This function performs filtering of a luma block         */
/*                  horizontal edge when the boundary strength is set to 4.  */
/*                                                                           */
/*  Inputs        : pu1_src    - pointer to the src sample q0                */
/*                  src_strd   - source stride                               */
/*                  alpha      - alpha value for the boundary                */
/*                  beta       - beta value for the boundary                 */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4" in */
/*                  ITU T Rec H.264.                                         */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta)
{
    WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0;
    WORD16 i16_posQ1, i16_posQ2, i16_posQ3;
    UWORD8 *pu1_HorzPixel;
    __m128i zero = _mm_setzero_si128();
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    __m128i q0_16x8_1;
    __m128i p0_16x8_1;
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    __m128i Alpha_8x16, Beta_8x16;
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
    __m128i const_val2_16x8 = _mm_set1_epi16(2);

pu1_HorzPixel = pu1_src - (src_strd << 2);

i16_posQ1 = src_strd;
    i16_posQ2 = X2(src_strd);
    i16_posQ3 = X3(src_strd);
    i16_posP0 = X3(src_strd);
    i16_posP1 = X2(src_strd);
    i16_posP2 = src_strd;
    i16_posP3 = 0;

Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3));
    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
    q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3));