/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_sao_atom_intr.c
*
* @brief
*  Contains function definitions for sample adaptive offset (SAO) in-loop
* filtering
*
* @author
* 100592
*
* @par List of Functions:
*   - ihevc_sao_band_offset_luma_ssse3()
*   - ihevc_sao_band_offset_chroma_ssse3()
*   - ihevc_sao_edge_offset_class0_ssse3()
*   - ihevc_sao_edge_offset_class0_chroma_ssse3()
*   - ihevc_sao_edge_offset_class1_ssse3()
*   - ihevc_sao_edge_offset_class1_chroma_ssse3()
*   - ihevc_sao_edge_offset_class2_ssse3()
*   - ihevc_sao_edge_offset_class2_chroma_ssse3()
*   - ihevc_sao_edge_offset_class3_ssse3()
*   - ihevc_sao_edge_offset_class3_chroma_ssse3()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>

#include "ihevc_typedefs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_defs.h"
#include "ihevc_tables_x86_intr.h"
#include "ihevc_common_tables.h"
#include "ihevc_sao.h"

#include <immintrin.h>

#define NUM_BAND_TABLE  32
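
/*
 * Band offset reference: for 8-bit samples the range 0..255 is divided into
 * NUM_BAND_TABLE (32) bands of 8 values each, so a pixel p belongs to band
 * (p >> 3). Non-zero offsets are applied only to the four consecutive bands
 * starting at sao_band_pos; e.g. p = 137 lies in band 137 >> 3 = 17 and is
 * modified only if band 17 falls inside that four-band window.
 */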
/**
*******************************************************************************
*
* @brief
*  Has two sets of functions: band offset and edge offset, each for luma and
*  chroma. Edge offset supports horizontal, vertical, 135 degree and 45 degree
*  directions.
*
* @par Description:
*
*
* @param[in-out] pu1_src
*  Pointer to the source
*
* @param[in] src_strd
*  Source stride
*
* @param[in-out] pu1_src_left
*  source left boundary
*
* @param[in-out] pu1_src_top
* Source top boundary
*
* @param[in-out] pu1_src_top_left
*  Source top left boundary
*
* @param[in] pu1_src_top_right
*  Source top right boundary
*
* @param[in] pu1_src_bot_left
*  Source bottom left boundary
*
* @param[in] pu1_avail
*  boundary availability flags
*
* @param[in] pi1_sao_offset_u
*  Chroma U sao offset values
*
* @param[in] pi1_sao_offset_v
*  Chroma V sao offset values
*
* @param[in] pi1_sao_offset
*  Luma sao offset values
*
* @param[in] wd
*  Width of the source
*
* @param[in] ht
*  Height of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
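
/*
 * Illustrative call (hypothetical buffers and values, a usage sketch only):
 * apply band offset in place to one 64x64 luma CTB whose four-band window
 * starts at band 10. As read by this function, only entries 1..4 of the
 * offset array are used.
 *
 *     UWORD8 au1_left[64], au1_top[64], u1_top_left;
 *     WORD8 ai1_offset[5] = { 0, 2, 1, -1, -2 };
 *     ihevc_sao_band_offset_luma_ssse3(pu1_ctb, ctb_strd, au1_left, au1_top,
 *                                      &u1_top_left, 10, ai1_offset, 64, 64);
 */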


void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_src_left,
                                      UWORD8 *pu1_src_top,
                                      UWORD8 *pu1_src_top_left,
                                      WORD32 sao_band_pos,
                                      WORD8 *pi1_sao_offset,
                                      WORD32 wd,
                                      WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;
    WORD8 offset = 0;

    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
    __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
    __m128i band_pos_16x8b;
    __m128i sao_offset;
    __m128i cmp_mask, cmp_store;

    /* Update the left, top-left and top arrays */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 1];
    for(col = 0; col < wd; col += 8)
    {
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
        offset += 8;
    }

    //replicate sao_band_pos << 3 (start pixel value of the offset window) in all 16-bit lanes
    band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
    //value set for sao_offset extraction
    tmp_set_128i_1  = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
    tmp_set_128i_2  = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
    tmp_set_128i_3  = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
    tmp_set_128i_4  = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
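    //note: control byte 128 (0x80) has the pshufb zeroing bit set, so those
    //destination bytes become 0; the shuffles below broadcast bytes 1..4 of
    //sao_offset into the low byte of every 16-bit lane.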

    //loaded sao offset values
    sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
    band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
    band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
    band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
    band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));

    //band_position addition
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
    //sao_offset duplication
    tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
    tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
    tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
    tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
    //setting for comparison
    cmp_mask = _mm_set1_epi16(16);
    cmp_store = _mm_set1_epi16(0x00ff);

    //sao_offset addition
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
    //mask the upper 8 bits of each 16-bit band table value
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);

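    //window-boundary fix-up: when sao_band_pos is 0 or 28..31 the four-band
    //window touches an end of the 32-band range, and the wrapped table
    //entries below are patched with 0/ff so that pixels outside the window
    //are left unchanged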
    switch(sao_band_pos)
    {
        case 0:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
            band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
            break;
        case 28:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
            band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
            break;
        case 29:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
            band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
            band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
            break;
        case 30:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
            band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
            band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
            break;
        case 31:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
            band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
            band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
            break;
        default:
            break;
    }
    //sao_offset is reused for zero cmp mask.
    sao_offset = _mm_setzero_si128();
    tmp_set_128i_1 = _mm_set1_epi8(1);
    //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
    cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);

    //masking upper 8bit values of each  16 bit band table value
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);

    //the four 8x16b band table registers are packed into two 16x8b registers: band_table0_8x16b and band_table2_8x16b
    band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
    band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);

    band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
    band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned
    band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // to compare if value is greater than 31

    cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
    //  band_pos_16x8b = _mm_or_si128(band_pos_16x8b,cmp_store);
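
    //classification strategy for the loops below: subtracting the window
    //start maps in-window pixels to 0..31; lanes that underflow (<0) or
    //exceed 31 are forced to ff, which pshufb treats as "write zero". Since
    //pshufb indexes only 16 bytes, indices are split into 0..15 (looked up in
    //band_table0_8x16b) and 16..31 (band_table2_8x16b). At this point
    //cmp_mask holds 15 and band_table3_8x16b holds 31 in every byte.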

    for(col = wd; col >= 16; col -= 16)
    {
        pu1_src_cpy = pu1_src;
        for(row = ht; row > 0; row -= 2)
        {


            //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
            src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            // row = 1
            src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));



            //8-bit subtract of band start
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
            //if the values less than 0 put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            //if the values greater than 31 put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);


            //row 0
            //if the value is >15 (16..31) put ff; cmp_mask = dup16(15)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
            //values 16 to 31 for row 0 but values <16 == 0
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
            // values 0 to 15 for row 0
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
            //values 16 to 31 for row 0 but values <16 masked to ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
            //row 1
            //if the value is >15 (16..31) put ff; cmp_mask = dup16(15)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
            //values 16 to 31 for row 1 but values <16 == 0
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
            // values 0 to 15 for row 1
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
            //values 16 to 31 for row 1 but values <16 masked to ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);

            //row 0
            //to preserve pixel values to which no offset needs to be added.
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);

            //row 1
            //to preserve pixel values to which no offset needs to be added.
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);

            //indexing 0 - 15 bandtable indexes
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
            // combining all offsets results
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            // combining results with the pixel values
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);


            //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
            _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
            // row = 1
            _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);

            pu1_src_cpy += (src_strd << 1);
        }
        pu1_src += 16;
    }
    wd_rem = wd & 0xF;
    if(wd_rem)
    {
        pu1_src_cpy = pu1_src;
        for(row = ht; row > 0; row -= 4)
        {


            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
            src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            // row = 1
            src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
            // row = 2
            src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
            // row = 3
            src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
            //row0 and row1 packed and row2 and row3 packed

            src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
            src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);

            //8-bit subtract of band start
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
            //if the values less than 0 put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            //if the values greater than 31 put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);



            //row 0 and row1
            //if the value is >15 (16..31) put ff; cmp_mask = dup16(15)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
            //values 16 to 31 for row 0 & 1 but values <16 ==0
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
            // values 0 to 15 for row 0 & 1
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
            //values 16 to 31 for row 0 & 1 but values <16 masked to ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
            //row 2 and  row 3
            //if the value is >15 (16..31) put ff; cmp_mask = dup16(15)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
            //values 16 to 31 for row 2 & 3 but values <16 ==0
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
            // values 0 to 15 for row 2 & 3
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
            //values 16 to 31 for row 2 & 3 but values <16 masked to ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);

            //row 0 and row 1
            //to preserve pixel values in which no offset needs to be added.
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);

            //row 2 and row 3
            //to preserve pixel values in which no offset needs to be added.
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);

            //indexing 0 - 15 bandtable indexes
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
            // combining all offsets results
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            // combining results with the pixel values
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);

            //Getting row1 separately
            src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
            //Getting row3 separately
            src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);

            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
            _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
            // row = 1
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
            // row = 2
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
            // row = 3
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);

            pu1_src_cpy += (src_strd << 2);

        }
        pu1_src += 8;
    }


}

void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        WORD32 sao_band_pos_u,
                                        WORD32 sao_band_pos_v,
                                        WORD8 *pi1_sao_offset_u,
                                        WORD8 *pi1_sao_offset_v,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    WORD8 offset = 0;


    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
    __m128i cmp_msk2;
    __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
    __m128i band_pos_u_16x8b, band_pos_v_16x8b;
    __m128i sao_offset;
    __m128i cmp_mask;


    /* Update the left, top and top-left arrays */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
    for(col = 0; col < wd; col += 8)
    {
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
        offset += 8;
    }
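
    //chroma samples are interleaved (U in even bytes, V in odd bytes), so a
    //separate band table is built for each plane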

    { // band table creation
        __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
        // Band table for U component : band_table0_16x8b and band_table2_16x8b
        //replicating sao_band_pos as 8 bit value 16 times
        band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
        //value set for sao_offset extraction
        tmp_set_128i_1  = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
        tmp_set_128i_2  = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
        tmp_set_128i_3  = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
        tmp_set_128i_4  = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);

        //loaded sao offset values
        sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);

        //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
        band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
        band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));

        //band_position addition
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
        //sao_offset duplication
        temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
        temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
        temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
        temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);

        //sao_offset addition
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
        //reuse for clipping
        temp1_8x16b = _mm_set1_epi16(0x00ff);
        //setting for comparison
        cmp_mask = _mm_set1_epi16(16);

        //mask the upper 8 bits of each 16-bit band table value
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);

        //temp1_8x16b reuse for compare storage
        switch(sao_band_pos_u)
        {
            case 0:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
                band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
                break;
            case 28:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
                break;
            case 29:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
                band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
                break;
            case 30:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
                band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
                break;
            case 31:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
                band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
                break;
            default:
                break;
        }
        //mask the upper 8 bits of each 16-bit band table value
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
        //the four 8x16b band table registers are packed into two 16x8b registers: band_table0_16x8b and band_table2_16x8b
        band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
        band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
        // Band table for U component over

        // Band table for V component : band_table1_16x8b and band_table3_16x8b
        // replicating sao_band_pos as 8 bit value 16 times
        band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));

        //loaded sao offset values
        sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);

        //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
        temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
        temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));

        //band_position addition
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
        //sao_offset duplication
        tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
        tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
        tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
        tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);

        //sao_offset addition
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);

        //mask the upper 8 bits of each 16-bit band table value
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
        //temp1_8x16b reuse for compare storage

        switch(sao_band_pos_v)
        {
            case 0:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
                temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
                break;
            case 28:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
                break;
            case 29:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
                temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
                break;
            case 30:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
                temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
                break;
            case 31:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
                temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
                break;
            default:
                break;
        }
        //mask the upper 8 bits of each 16-bit band table value
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
        //the four 8x16b band table registers are packed into two 16x8b registers: band_table1_16x8b and band_table3_16x8b
        band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
        band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
        //band table for u and v created
    }
    {
        UWORD8 *pu1_src_cpy;
        WORD32 wd_rem;


        //sao_offset is reused for zero cmp mask.
        sao_offset = _mm_setzero_si128();
        tmp_set_128i_1 = _mm_set1_epi8(1);
        //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
        cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
        //to avoid ffff being saturated to 0; it should saturate to ff

        cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
        band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
        band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
        cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31

        cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
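
        //each iteration below de-interleaves U and V (even/odd bytes) with
        //shift+pack, classifies each plane against its own band table exactly
        //as in the luma path, and re-interleaves the results with
        //unpacklo/unpackhi before storing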

        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            for(row = ht; row > 0; row -= 2)
            {
                //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));


                //odd values
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
                //even values
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
                //combining odd values
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
                //combining even values
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);

                //8-bit subtract of band start
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
                //if the values less than 0 put ff
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
                //if the values greater than 31 put ff
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
                // registers reused to increase performance
                //if the value is >15 (16..31) put ff; cmp_mask = dup16(15): U plane
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
                //if the value is >15 (16..31) put ff; cmp_mask = dup16(15): V plane
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);

                //values 16 to 31 for U but values <16 == 0
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
                // values 0 to 15 for U
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
                //values 16 to 31 for V but values <16 == 0
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
                // values 0 to 15 for V
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);

                //values 16 to 31 for U but values <16 masked to ff
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
                //values 16 to 31 for V but values <16 masked to ff
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);


                //to choose which U pixel values to preserve
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
                //to choose which V pixel values to preserve
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
                //pixel values to which no offset needs to be added are preserved
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);

                //indexing 0 - 15 bandtable indexes
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
                //indexing 16 -31 bandtable indexes
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
                // combining all offsets results
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
                // combining results with the pixel values
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
                //reorganising even and odd values
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);


                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);


                pu1_src_cpy += (src_strd << 1);

            }
            pu1_src += 16;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            for(row = ht; row > 0; row -= 4)
            {
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 3
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
                //row0 and row1 packed and row2 and row3 packed

                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
                src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
                //odd values
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
                //even values
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
                //combining odd values
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
                //combining even values
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);

                //8-bit subtract of band start
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
                //if the values less than 0 put ff
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
                //if the values greater than 31 put ff
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
                // registers reused to increase performance
                //if the value is >15 (16..31) put ff; cmp_mask = dup16(15): U plane
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
                //if the value is >15 (16..31) put ff; cmp_mask = dup16(15): V plane
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);

                //values 16 to 31 for U but values <16 == 0
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
                // values 0 to 15 for U
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
                //values 16 to 31 for V but values <16 == 0
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
                // values 0 to 15 for V
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);

                //values 16 to 31 for U but values <16 masked to ff
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
                //values 16 to 31 for V but values <16 masked to ff
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);


                //to choose which U pixel values to preserve
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
                //to choose which V pixel values to preserve
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
                //pixel values to which no offset needs to be added are preserved
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);

                //indexing 0 - 15 bandtable indexes
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
                //indexing 16 -31 bandtable indexes
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
                // combining all offsets results
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
                // combining results with the pixel values
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
                //reorganising even and odd values
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
                //Getting row1 separately
                src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
                //Getting row3 separately
                src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);

                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
                // row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);

                pu1_src_cpy += (src_strd << 2);

            }
            pu1_src += 16;
        }


    }
}



void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        UWORD8 *pu1_src_top_right,
                                        UWORD8 *pu1_src_bot_left,
                                        UWORD8 *pu1_avail,
                                        WORD8 *pi1_sao_offset,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
    UWORD8 u1_avail0, u1_avail1;
    WORD32 wd_rem;
    WORD32 offset = 0;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i left0_16x8b, left1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    au1_mask8x16b = _mm_set1_epi8(0xff);

    /* Update  top and top-left arrays */

    *pu1_src_top_left = pu1_src_top[wd - 1];

    for(col = wd; col >= 16; col -= 16)
    {
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
        offset += 16;
    }

    //set the availability mask to ff for MAX_CTB_SIZE bytes
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
    for(row = 0; row < ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;

    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();
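    //edge offset class0 (horizontal): for each pixel c the edge index is
    //sign(c - left) + sign(c - right) + 2, giving 0..4; this is remapped
    //through gi1_table_edge_idx and then indexes pi1_sao_offset to get the
    //value added to c. const2_16x8b holds the +2 bias.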
    pu1_src_left_cpy = au1_src_left_tmp;
    pu1_src_left_str = au1_src_left_tmp1;
    {
        au1_mask_cpy = au1_mask;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
            //pu1_src_left_cpy =au1_src_left_tmp;
            for(row = ht; row > 0; row -= 2)
            {

                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
                //row 1 left
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
                //row 0 left
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);


                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
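
                //sign trick: the paired saturating subtracts leave the
                //positive difference in one register and 0 in the other;
                //comparing each with 0 and subtracting the two ff/00 masks
                //yields the per-byte sign (-1, 0 or +1) of (src - neighbour)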

                //row = 0 right
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
                // row = 1 right
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);

                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit, add, then saturated pack
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);


                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            au1_mask_cpy += 16;
            pu1_src += 16;
            pu1_src_left_cpy -= ht;
            pu1_src_left_str -= ht;

            pu1_left_tmp = pu1_src_left_cpy;
            pu1_src_left_cpy = pu1_src_left_str;
            pu1_src_left_str = pu1_left_tmp;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {

            cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);

            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
            pu1_src_cpy = pu1_src;
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
            //pu1_src_left_cpy =au1_src_left_tmp;
            for(row = ht; row > 0; row -= 4)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
                // row = 1
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row  = 2
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 3
                cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));


                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
                //row 3 left
                edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
                cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
                //row 2 left
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
                //row 1 left
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
                cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
                //row 0 left
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
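
                //the alignr pairs above splice each row's left-neighbour byte
                //in from left_store_16x8b while rotating that row's rightmost
                //pixel into left_store_16x8b, which is later stored out via
                //pu1_src_left_str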

                // pack pairs of 8-pixel rows into single registers for 16-wide SIMD operations
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);

                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);

                //row = 0 right
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
                // row = 1 right
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
                // row = 2 right
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
                // row = 3 right
                cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
                // pack pairs of 8-pixel rows into single registers for 16-wide SIMD operations
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);

                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);

                //combining sign_left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit, then add, then saturated pack
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
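                /* The offsets are signed 8-bit and the pixels unsigned, so the
                 * sum may leave 8-bit range: cmpgt(0, edge) builds a sign mask,
                 * unpacking the offsets against it sign-extends them to 16 bit,
                 * the pixels are zero-extended, and packus saturates the sums
                 * back to [0, 255]. */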

                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
                //separating row 1 and row 3
                cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
                // row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);

                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            pu1_src += wd;
            pu1_src_left_cpy -= ht;
            pu1_src_left_str -= ht;

            pu1_left_tmp = pu1_src_left_cpy;
            pu1_src_left_cpy = pu1_src_left_str;
            pu1_src_left_str = pu1_left_tmp;
        }
        for(row = 0; row < ht; row++)
        {
            pu1_src_left[row] = pu1_src_left_cpy[row];
        }
    }
}
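
/* A minimal scalar sketch (compiled out) of the class0, i.e. horizontal,
 * edge-offset decision that the SIMD kernels in this file vectorize. The
 * function name and the SAO_SIGN0 macro are hypothetical and not part of the
 * decoder; gi1_table_edge_idx is the same remap table the intrinsics load.
 * Only interior pixels are shown; boundary handling follows pu1_avail. */
#if 0
#define SAO_SIGN0(x) (((x) > 0) - ((x) < 0))
static void sao_edge_offset_class0_ref(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD8 *pi1_sao_offset,
                                       WORD32 wd,
                                       WORD32 ht)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        /* keep the unfiltered left neighbour, since filtering is in-place */
        UWORD8 u1_prev = pu1_src[row * src_strd];
        for(col = 1; col < wd - 1; col++)
        {
            WORD32 pixel      = pu1_src[row * src_strd + col];
            WORD32 sign_left  = SAO_SIGN0(pixel - u1_prev);
            WORD32 sign_right = SAO_SIGN0(pixel - pu1_src[row * src_strd + col + 1]);
            WORD32 edge_idx   = gi1_table_edge_idx[2 + sign_left + sign_right];
            WORD32 res        = pixel + pi1_sao_offset[edge_idx];

            u1_prev = (UWORD8)pixel;
            pu1_src[row * src_strd + col] = (UWORD8)(res < 0 ? 0 : (res > 255 ? 255 : res));
        }
    }
}
#endif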


void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 u1_avail0, u1_avail1;
    WORD32 wd_rem;
    WORD32 offset = 0;

    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i left0_16x8b, left1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;
    __m128i chroma_offset_8x16b;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    au1_mask8x16b = _mm_set1_epi8(0xff);

    /* Update top and top-left arrays */
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];

    for(col = wd; col >= 16; col -= 16)
    {
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
        offset += 16;
    }
    for(row = 0; row < 2 * ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    //setting availability mask to 0xFF for all MAX_CTB_SIZE bytes
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);

    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
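    /* pi1_sao_offset_u and pi1_sao_offset_v are concatenated into one register
     * below (see the unpacklo), and chroma_offset_8x16b is the byte pattern
     * 00 08 00 08 ...: adding it to the interleaved edge indices biases every
     * V lane by 8, so a single pshufb fetches U offsets from the low half of
     * the table and V offsets from the high half. */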
    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[1] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;
    au1_mask[wd - 2] = u1_avail1;
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();

    {
        pu1_src_left_cpy = au1_src_left_tmp;
        pu1_src_left_str = au1_src_left_tmp1;
        au1_mask_cpy = au1_mask;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);

            for(row = ht; row > 0; row -= 2)
            {

                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
                //row 1 left
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
                //row 0 left
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);


                //separating +ve and -ve values, row 0 left
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //separating +ve and -ve values, row 1 left
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);


                //row = 0 right
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
                // row = 1 right
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
                //separating +ve and -ve values, row 0 right
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //separating +ve and -ve values, row 1 right
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign_left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit, then add, then saturated pack
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            au1_mask_cpy += 16;
            pu1_src += 16;
            pu1_src_left_cpy -= 2 * ht;
            pu1_src_left_str -= 2 * ht;

            pu1_left_tmp = pu1_src_left_cpy;
            pu1_src_left_cpy = pu1_src_left_str;
            pu1_src_left_str = pu1_left_tmp;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {

            cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b);

            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
            pu1_src_cpy = pu1_src;
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);

            for(row = ht; row > 0; row -= 4)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
                // row = 1
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row  = 2
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 3
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));


                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8);
                //row 3 left
                edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8);
                left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
                //row 2 left
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);


                // packing rows together for 16 SIMD operations
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b);
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b);

                //row 1 left
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
                edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
                //row 0 left
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
                // packing rows together for 16 SIMD operations
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b);

                //separating +ve and -ve values for row 2 and row 3
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //separating +ve and -ve values for row 0 and row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);


                //row = 0 right
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2));
                // row = 1 right
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2));
                // row = 2 right
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
                // row = 3 right
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2));
                // packing rows together for 16 SIMD operations
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b);

                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign_left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit, then add, then saturated pack
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                //separating row 1 and row 3
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                // row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 8;
                pu1_src_left_str += 8;
            }
            pu1_src += wd;
            pu1_src_left_cpy -= 2 * ht;
            pu1_src_left_str -= 2 * ht;

            pu1_left_tmp = pu1_src_left_cpy;
            pu1_src_left_cpy = pu1_src_left_str;
            pu1_src_left_str = pu1_left_tmp;
        }
        for(row = 0; row < 2 * ht; row++)
        {
            pu1_src_left[row] = pu1_src_left_cpy[row];
        }
    }

}


void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        UWORD8 *pu1_src_top_right,
                                        UWORD8 *pu1_src_bot_left,
                                        UWORD8 *pu1_avail,
                                        WORD8 *pi1_sao_offset,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;


    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;

    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);


    /* Updating left and top-left */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
    }
    *pu1_src_top_left = pu1_src_top[wd - 1];



    pu1_src_top_cpy = pu1_src_top;
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();

    {
        WORD32 ht_rem;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
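            /* For this vertical class, sign(cur - top) of a row equals the
             * negated sign(cur - bottom) of the row above, so signup0_16x8b is
             * seeded from the top row here and then recycled inside the loop
             * as the negated bottom difference, halving the comparisons. */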

            for(row = ht; row >= 2; row -= 2)
            {

                //row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));


                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);

                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign_up and sign_down
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit, then add, then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
            }
            ht_rem = ht & 0x1;

            if(ht_rem)
            {
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //current row -next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit, then add, then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //updating the top row buffer
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
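            /* In this 8-pixel remainder path two rows share one XMM register:
             * sign vectors are staged with slli/alignr so each half holds one
             * row's term (e.g. (1-0) above (0-top)), letting a single shuffle
             * resolve two rows of edge indices at once. signup0_16x8b is
             * pre-shifted into the high half so the first alignr in the loop
             * drops it into the correct lane. */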
            for(row = ht; row >= 4; row -= 4)
            {
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //packing row 0 and row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row = 3
                src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
                // row = 4
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
                //separating +ve and -ve values (2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3),(1-2) (subtract with down)
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
                //separating +ve and -ve values (3,4)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
                //combining sign_up and sign_down
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)

                edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)

                //packing row 2 and row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //the next top is already in src_top_16x8b
                //src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit, then add, then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);

            }
            ht_rem = ht & 0x2;
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
                src_top_16x8b = src_temp1_16x8b;
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //the next top is already in src_top_16x8b
                //convert to 16 bit, then add, then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);

            }
            ht_rem = ht & 0x1;
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit, then add, then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);

            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
        }
    }
}
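
/* A minimal scalar sketch (compiled out) of the class1, i.e. vertical,
 * decision vectorized above and below; the function name and SAO_SIGN1 macro
 * are hypothetical. It also makes explicit the identity the SIMD code
 * exploits: each row's top difference is the negated bottom difference of
 * the row above. Interior rows only; boundaries follow pu1_avail. */
#if 0
#define SAO_SIGN1(x) (((x) > 0) - ((x) < 0))
static void sao_edge_offset_class1_ref(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD8 *pi1_sao_offset,
                                       WORD32 wd,
                                       WORD32 ht)
{
    WORD32 row, col;
    for(col = 0; col < wd; col++)
    {
        WORD32 sign_up = SAO_SIGN1(pu1_src[src_strd + col] - pu1_src[col]);
        for(row = 1; row < ht - 1; row++)
        {
            WORD32 pixel     = pu1_src[row * src_strd + col];
            WORD32 sign_down = SAO_SIGN1(pixel - pu1_src[(row + 1) * src_strd + col]);
            WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
            WORD32 res       = pixel + pi1_sao_offset[edge_idx];

            pu1_src[row * src_strd + col] = (UWORD8)(res < 0 ? 0 : (res > 255 ? 255 : res));
            sign_up = -sign_down; /* next row's top difference */
        }
    }
}
#endif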

void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;


    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i chroma_offset_8x16b;

    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    /* Updating left, top and top-left */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];



    pu1_src_top_cpy = pu1_src_top;
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();


    {
        WORD32 ht_rem;



        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

            for(row = ht; row >= 2; row -= 2)
            {

                //row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));


                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);

                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign_up and sign_down
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;


                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit, then add, then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
            }
            ht_rem = ht & 0x1;

            if(ht_rem)
            {
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //current row -next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit, then add, then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //updating the top boundary buffer
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
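            /*
             * The 8-pixel-wide remainder below processes four rows per
             * iteration: two rows are packed into each 128-bit register with
             * _mm_unpacklo_epi64(), and the matching sign vectors are carried
             * in the two 64-bit halves of signup0_16x8b/edge0_16x8b/edge1_16x8b.
             */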
            for(row = ht; row >= 4; row -= 4)
            {
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //packing row 0 n row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row = 3
                src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
                // row = 4
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with down)
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
                //separating +ve and -ve values.(3,4)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
                //combining sign-left and sign_right
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)

                edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)

                //packing row 2 n row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //the next top already in  src_top_16x8b
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);

            }
            ht_rem = ht & 0x2;
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
                src_top_16x8b = src_temp1_16x8b;

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);

                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                //the next top already in  src_top_16x8b
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);

            }
            ht_rem = ht & 0x1;
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                src_top_16x8b = src_temp0_16x8b;

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);

            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
        }
    }
}

/* 135 degree filtering */
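/*
 * EO class 2 compares each pixel against its top-left and bottom-right
 * neighbours.  A scalar sketch of the per-pixel operation the SIMD code
 * below implements (the corner handling further down does exactly this):
 *
 *     sign_up   = SIGN(cur - top_left);
 *     sign_down = SIGN(cur - bottom_right);
 *     edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
 *     out       = CLIP3(cur + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
 */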
void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        UWORD8 *pu1_src_top_right,
                                        UWORD8 *pu1_src_bot_left,
                                        UWORD8 *pu1_avail,
                                        WORD8 *pi1_sao_offset,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
    UWORD8 *pu1_firstleft;
    UWORD8 *pu1_src_cpy, *pu1_src_org;
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
    WORD32 wd_rem;
    UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp;
    WORD32 ht_tmp, ht_0;

    WORD32 bit_depth;
    UWORD8 u1_avail0, u1_avail1;

    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    ht_0 = ht; ht_tmp = ht;
    au1_mask8x16b = _mm_set1_epi8(0xff);

    //setting availability mask to 0xFF for MAX_CTB_SIZE bytes
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
    for(row = 0; row < ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    bit_depth = BIT_DEPTH_LUMA;
    pu1_src_org = pu1_src;
    pu1_src_top_cpy = pu1_src_top;
    pu1_src_left_cpy2 = au1_src_left_tmp;
    pu1_src_left_cpy = au1_src_left_tmp;
    pu1_src_left_str2 = au1_src_left_tmp1;
    pu1_src_left_str = au1_src_left_tmp1;
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);


    /* If top-left is available, process separately */
    if(0 != pu1_avail[4])
    {
        WORD8 edge_idx;

        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
                        SIGN(pu1_src[0] - pu1_src[1 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_0_tmp = pu1_src[0];
        }
    }
    else
    {
        u1_pos_0_0_tmp = pu1_src[0];
    }

    /* If bottom-right is available, process separately */
    if(0 != pu1_avail[7])
    {
        WORD8 edge_idx;

        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
        }
    }
    else
    {
        u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
    }
    pu1_firstleft = pu1_src_top_left;

    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        pu1_firstleft = pu1_src_left_cpy2;
        pu1_src_left_cpy2++;
        pu1_src_left_str2++;
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
        ht_0--;
    }
    //storing top-left in an xmm register
    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();
    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
    //update top-left
    *pu1_src_top_left = pu1_src_top[wd - 1];
    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;
    {
        WORD32 ht_rem;


        pu1_src_left_cpy = pu1_src_left_cpy2;
        pu1_src_left_str = pu1_src_left_str2;
        au1_mask_cpy = au1_mask;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
            //loading the mask
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
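            /*
             * The three steps above compute SIGN(src - top) per byte without a
             * signed compare: _mm_subs_epu8() in both directions isolates the
             * positive and negative differences, _mm_cmpeq_epi8() against zero
             * turns each into a 0x00/0xFF mask, and subtracting the two masks
             * leaves exactly +1, 0 or -1 in every byte.
             */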


            for(row = ht; row >= 2; row -= 2)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 1 right
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
                //to insert left in row 0
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
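                /* signdwn1_16x8b now holds row 0 shifted right by one byte
                 * with the left-boundary pixel in byte 0, i.e. the top-left
                 * neighbours used for the (row 1 - row 0) sign below */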
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
                //row1-row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                // row = 2 right
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)


                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);

                //storing the row 1 left for next row.
                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);

                //combining sign-left and sign_right
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
                //manipulation for bottom - row 1
                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                //bottom - row1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration bottom -row1
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //pushing row 1's rightmost pixel into the left store for the next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //pushing row 0's rightmost pixel into the left store for the next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
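                /*
                 * The two blocks above add the signed 8-bit offsets to the
                 * unsigned pixels at 16-bit precision: each offset vector is
                 * sign-extended by unpacking against its own sign mask
                 * (signdwn1_16x8b), the pixels are zero-extended against
                 * const0_16x8b, and _mm_packus_epi16() saturates the sums
                 * back to the [0, 255] pixel range.
                 */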

                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            ht_rem = ht & 0x1;

            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
                //current row -next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);

                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //pushing row 0's rightmost pixel into the left store for the next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                pu1_src_left_cpy += 1;
                pu1_src_left_str += 1;
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
                pu1_src_left_str[0] = pu1_src_cpy[15];
            }
            if(0 == pu1_avail[2])
            {
                pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
            }

            //for the top left of next part of the block
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //updating the top boundary buffer
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
            au1_mask_cpy += 16;


            pu1_left_tmp = pu1_src_left_cpy2;
            pu1_src_left_cpy2 = pu1_src_left_str2;
            pu1_src_left_str2 = pu1_left_tmp;

            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
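            /*
             * The two left-column buffers are ping-ponged here: the stripe
             * just processed read the old left column through
             * pu1_src_left_cpy while writing the new one through
             * pu1_src_left_str, which becomes the input of the next
             * 16-pixel-wide stripe.
             */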
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the availability mask (lower 8 bytes)
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //preparing au1_mask
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
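            /* the 8-byte mask is duplicated into both halves because the
             * loop below packs two rows per 128-bit register */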
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);

            for(row = ht; row >= 4; row -= 4)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 16 bytes (8 pixels plus their right neighbours)
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
                //right row1
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //manipulation for row 1 -row 0
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 0 left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1 -row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //right row2
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
                //packing row 0 n row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //manipulation for row 2 -row 1
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
                //row 1 left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
                //row = 3
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));

                // row = 4
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty

                //separating +ve and -ve values.(2,1)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
                //manipulation for row 3 -row 2
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 2 left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
                //combining the appropriate sign change
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)

                //separating +ve and -ve values.(3,2)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //right row3
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)

                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
                //right row 4
                signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 1);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                //separating +ve and -ve values.(3,bottom)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)

                //manipulation for bottom -row 3
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
                //eliminating old left for row 0,1,2,3
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
                //packing row 2 n row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //row 3 left
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
                //loading row 3 right into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
                //adding bottom and top values of row 2 and row 3
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
                //separating +ve and -ve values.(bottom,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //to store right of row 2
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration

                //storing right of row 2 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
                //to store right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);

                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            ht_rem = ht & 0x2;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 16 bytes (8 pixels plus their right neighbours)
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row 1
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //manipulation for row 1 -row 0
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1 -bottom
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //manipulation for bottom -row1
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
                //manipulation for bottom- row 1
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //bottom - row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //eliminating old left for row 0,1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration signup0_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next

                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
                //for storing right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);

                src_top_16x8b = src_temp1_16x8b;
                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //the next top already in  src_top_16x8b
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            ht_rem = ht & 0x1;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 16 bytes (8 pixels plus their right neighbours)
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //left store manipulation 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
                //row 0 -row1
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //for row 0 right to put into left store
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //filling the left boundary value
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                pu1_src_left_cpy += 1;
                pu1_src_left_str += 1;
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
                pu1_src_left_str[0] = pu1_src_cpy[7];
            }

            if(0 == pu1_avail[2])
            {
                pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd];
            }

            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
            au1_mask_cpy += 16;

            pu1_left_tmp = pu1_src_left_cpy2;
            pu1_src_left_cpy2 = pu1_src_left_str2;
            pu1_src_left_str2 = pu1_left_tmp;

            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
        }
        pu1_src_org[0] = u1_pos_0_0_tmp;
        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp;
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
        for(row = 0; row < ht_tmp; row++)
        {
            pu1_src_left[row] = pu1_src_left_cpy[row];
        }
    }

}

/* 135 degree filtering */
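/*
 * Chroma variant of the class 2 (135 degree) filter for interleaved UV
 * data: a pixel pair occupies two bytes, so the diagonal neighbours sit
 * +/-2 bytes away horizontally (hence the +/-2 terms in the corner
 * handling below), the left boundary stores two bytes per row, and
 * separate U and V offset tables are applied to the even and odd bytes.
 */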
void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
    UWORD8 *pu1_firstleft;
    UWORD8 *pu1_src_cpy, *pu1_src_org;
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
    WORD32 wd_rem;
    UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v;
    WORD32 ht_tmp;
    WORD32 ht_0;

    WORD32 bit_depth;
    UWORD8 u1_avail0, u1_avail1;

    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;
    __m128i chroma_offset_8x16b;

    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    ht_0 = ht; ht_tmp = ht;
    au1_mask8x16b = _mm_set1_epi8(0xff);
    /* Copying the left boundary (two bytes per row for interleaved UV) */
    for(row = 0; row < 2 * ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    //setting availability mask to 0xFF for MAX_CTB_SIZE bytes
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
    bit_depth = BIT_DEPTH_LUMA;
    pu1_src_org = pu1_src;
    pu1_src_top_cpy = pu1_src_top;
    pu1_src_left_cpy2 = au1_src_left_tmp;
    pu1_src_left_cpy = au1_src_left_tmp;
    pu1_src_left_str2 = au1_src_left_tmp1;
    pu1_src_left_str = au1_src_left_tmp1;
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
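    /* sao_offset_8x16b currently holds the U offsets; const0_16x8b is used
       as a temporary for the V offsets until the two are merged into one
       register (low half U, high half V) and const0_16x8b is re-zeroed.
       chroma_offset_8x16b is the byte pattern 00 08 00 08 ..., so adding it
       to the edge indices biases every odd (V) byte lane by 8: a single
       pshufb then fetches U offsets for even lanes and V offsets for odd
       lanes of the interleaved UV data. */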

    /* If top-left is available, process separately */
    if(0 != pu1_avail[4])
    {
        WORD32 edge_idx;

        /* U */
        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
                        SIGN(pu1_src[0] - pu1_src[2 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_0_tmp_u = pu1_src[0];
        }

        /* V */
        edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
                        SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_0_tmp_v = pu1_src[1];
        }
    }
    else
    {
        u1_pos_0_0_tmp_u = pu1_src[0];
        u1_pos_0_0_tmp_v = pu1_src[1];
    }
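    /* with interleaved UV data the 135 degree neighbours of a chroma sample
       sit 2 bytes away horizontally and one stride away vertically, hence
       the "2 + src_strd" terms in the corner handling above and below */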

    /* If bottom-right is available, process separately */
    if(0 != pu1_avail[7])
    {
        WORD32 edge_idx;

        /* U */
        edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
                        SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
        }

        /* V */
        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
        }
    }
    else
    {
        u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
        u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
    }
    pu1_firstleft = pu1_src_top_left;

    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        pu1_firstleft = pu1_src_left_cpy2;
        pu1_src_left_cpy2 += 2;
        pu1_src_left_str2 += 2;
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
        ht_0--;
    }
    //storing the top-left UV pair in an xmm register
    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();
    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);

    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[1] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;
    au1_mask[wd - 2] = u1_avail1;
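    /* two mask bytes per edge: the U and V samples of the first and last
       chroma columns are enabled or disabled together */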

    /* save the top-left (U,V) pair before the top row buffer is overwritten */
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
    {
        WORD32 ht_rem;
        au1_mask_cpy = au1_mask;

        pu1_src_left_cpy = pu1_src_left_cpy2;
        pu1_src_left_str = pu1_src_left_str2;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
            //loading the mask
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
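            /* the three steps above compute a per-byte sign(a - b) without a
               signed byte subtract: _mm_subs_epu8(a, b) saturates to 0 when
               a <= b, the compares turn each zeroed difference into 0xFF, and
               the final byte subtract of the two masks yields
                 a > b  : 0x00 - 0xFF = +1
                 a < b  : 0xFF - 0x00 = -1
                 a == b : 0xFF - 0xFF =  0
               i.e. exactly sign(a - b) per lane; the same idiom is repeated
               for every neighbour difference below */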


            for(row = ht; row >= 2; row -= 2)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 1 right
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
                //to insert left in row 0
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
                //row1-row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                // row = 2 right
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)


                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);

                //storing the row 1 left for next row.
                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12);

                //combining sign-left and sign_right
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
                //manipulation for bottom - row 1
                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14);
                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
                //bottom - row1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration bottom -row1
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1: saving its right edge as left context for the next iteration
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;
                //row 0: saving its right edge as left context for the next iteration
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);


                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);


                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
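                /* the offsets are signed 8-bit while the pixels are unsigned,
                   so the sums above are formed in 16 bits: _mm_cmpgt_epi8
                   against zero builds a sign mask that lets the unpacks
                   sign-extend the offsets, the pixels are zero-extended
                   against const0_16x8b, and _mm_packus_epi16 saturates the
                   results back to the [0, 255] sample range */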

                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            ht_rem = ht & 0x1;

            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
                //current row -next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom sign terms and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //eliminating old left for row 0
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;
                //row 0: saving its right edge as left context for the next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);

                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
                pu1_src_left_str[1] = pu1_src_cpy[15];
                pu1_src_left_str[0] = pu1_src_cpy[14];
            }
            if(0 == pu1_avail[2])
            {
                pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
            }

            //for the top left of next part of the block
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //updating the top row buffer
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
            au1_mask_cpy += 16;

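            /* ping-pong the two left-column scratch buffers: the column just
               written through pu1_src_left_str2 becomes the source
               (pu1_src_left_cpy2) for the next stripe */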
            pu1_left_tmp = pu1_src_left_cpy2;
            pu1_src_left_cpy2 = pu1_src_left_str2;
            pu1_src_left_str2 = pu1_left_tmp;

            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
        }
        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the lower 8 mask bytes
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //preparing au1_mask
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
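            /* residual-width path: the stripe is only 8 bytes wide, so two
               rows share one xmm register (low/high halves) and each pass of
               the loop below consumes four rows, aligning the sign terms
               with 8-byte shifts */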

            for(row = ht; row >= 4; row -= 4)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
                //right row1
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //manipulation for row 1 -row 0
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 0 left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1 -row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //right row2
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
                //packing row 0 and row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //manipulation for row 2 -row 1
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
                //row 1 left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
                //row = 3
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));

                // row = 4
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty

                //separating +ve and -ve values.(2,1)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
                //manipulation for row 3 -row 2
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 2 left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
                //combining the appropriate sign change
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)

                //separating +ve and -ve values.(3,2)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //right row3
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)

                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
                //right row 4
                signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 2);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                //separating +ve and -ve values.(3,bottom)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)

                //manipulation for bottom -row 3
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
                //eliminating old left for row 0,1,2,3
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
                //packing row 2 and row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //row 3 left
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);

                //adding bottom and top values of row 2 and row 3
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
                //separating +ve and -ve values.(bottom,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration

                //to store right of row 2
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
                //loading row 3 right into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
                //storing right of row 2 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
                //to store right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);

                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);


                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 8;
                pu1_src_left_str += 8;
            }
            ht_rem = ht & 0x2;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row 1
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //manipulation for row 1 -row 0
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1 -bottom
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //manipulation for bottom -row1
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
                //eliminating old left for row 0,1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
                //manipulation for bottom- row 1
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
                //adding top and down subtraction terms
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //bottom - row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //shifting row 1
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration signup0_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); //for storing right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //the next top in src_top_16x8b
                src_top_16x8b = src_temp1_16x8b;
                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);


                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);

                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                //the next top is already in src_top_16x8b
                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            ht_rem = ht & 0x1;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

                //row 0 -row1
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and down subtraction terms
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);

                //for row 0 right to put into left store
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //eliminating the old left entry for row 0
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                src_top_16x8b = src_temp0_16x8b;
                //filling the left boundary value
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);


                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
                pu1_src_left_str[1] = pu1_src_cpy[7];
                pu1_src_left_str[0] = pu1_src_cpy[6];
            }

            if(0 == pu1_avail[2])
            {
                pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd];
                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd];
            }

            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;

            pu1_left_tmp = pu1_src_left_cpy2;
            pu1_src_left_cpy2 = pu1_src_left_str2;
            pu1_src_left_str2 = pu1_left_tmp;

            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
        }
        pu1_src_org[0] = u1_pos_0_0_tmp_u;
        pu1_src_org[1] = u1_pos_0_0_tmp_v;
        pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u;
        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v;
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy;
        for(row = 0; row < 2 * ht_tmp; row++)
        {
            pu1_src_left[row] = pu1_src_left_cpy[row];
        }
    }

}

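/* 45 degree filtering: class3 compares each pixel with its top-right and
 * bottom-left neighbours, so the top row is read with a one-pixel offset and
 * the saved left column is consumed one row ahead (note pu1_src_bot_left
 * appended at au1_src_left_tmp[ht] below). */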
void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        UWORD8 *pu1_src_top_right,
                                        UWORD8 *pu1_src_bot_left,
                                        UWORD8 *pu1_avail,
                                        WORD8 *pi1_sao_offset,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
    UWORD8 *pu1_src_cpy, *pu1_src_org;
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    WORD32 wd_rem;
    UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp;
    WORD32 ht_tmp;
    WORD32 bit_depth;
    UWORD8 u1_avail0, u1_avail1;

    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;

    ht_tmp = ht;
    au1_mask8x16b = _mm_set1_epi8(0xff);

    au1_src_left_tmp[0] = pu1_src[(wd - 1)];
    //manipulation for bottom left
    for(row = 1; row < ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    au1_src_left_tmp[ht] = pu1_src_bot_left[0];

    *pu1_src_top_left = pu1_src_top[wd - 1];
    //setting availability mask to 0xFF for all MAX_CTB_SIZE entries
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
    bit_depth = BIT_DEPTH_LUMA;
    pu1_src_org = pu1_src;
    pu1_src_top_cpy = pu1_src_top;
    pu1_src_left_cpy2 = au1_src_left_tmp;
    pu1_src_left_cpy = au1_src_left_tmp;
    pu1_src_left_str2 = au1_src_left_tmp1;
    pu1_src_left_str = au1_src_left_tmp1;
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    /* If top-right is available, process separately */
    if(0 != pu1_avail[5])
    {
        WORD32 edge_idx;

        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_0_tmp = pu1_src[wd - 1];
        }
    }
    else
    {
        u1_pos_wd_0_tmp = pu1_src[wd - 1];
    }

    /* If bottom-left is available, process separately */
    if(0 != pu1_avail[6])
    {
        WORD32 edge_idx;

        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
        }
    }
    else
    {
        u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
    }



    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        pu1_src_left_cpy2++;
        pu1_src_left_str2++;
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }


    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();


    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;
    {
        WORD32 ht_rem;

        pu1_src_left_cpy = pu1_src_left_cpy2;
        pu1_src_left_str = pu1_src_left_str2;
        au1_mask_cpy = au1_mask;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));

            //loading the mask
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

            for(row = ht; row >= 2; row -= 2)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //to insert left in row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                // row = 0 right
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));

                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);

                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);

                //row1-row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);

                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 1 right
                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)

                //bottom - row1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration bottom -row1
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //to insert left in row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
                //manipulation for row 1 - bottom
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);

                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-left and sign_right
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);

                //row 1: saving its right edge as left context for the next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //row 0: saving its right edge as left context for the next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            ht_rem = ht & 0x1;

            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //to insert left in row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);

                //current row -next row
                //separating +ve and and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //eliminating old left for row 0
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);

                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //row 0: saving its right edge as left context for the next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_left_cpy++;
                pu1_src_left_str++;
            }
            {   //for bottom right
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //for the top left of next part of the block
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //updating the top row buffer
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
            au1_mask_cpy += 16;

            pu1_left_tmp = pu1_src_left_cpy2;
            pu1_src_left_cpy2 = pu1_src_left_str2;
            pu1_src_left_str2 = pu1_left_tmp;

            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the lower 8 bytes of the mask
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //preparing au1_mask
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
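            /* Width-remainder path: the trailing 8-pixel column is processed
             * four rows per iteration, packing two 8-pixel rows into each
             * XMM register so the 16-lane sign/offset machinery of the
             * full-width path can be reused unchanged. */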

            for(row = ht; row >= 4; row -= 4)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
                //manipulation for row 0 -row 1
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
                //row 1 left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulating for row 1 - row 0
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1 -row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
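                /* Comment notation: "(a-b)" is the 8-byte vector of per-pixel
                 * signs of row a minus row b; the palignr above splices two
                 * such vectors so one 16-byte register carries the sign terms
                 * for two rows at once. */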
                //manipulation for row 1 -row 2
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
                //row 2 left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
                //packing row 0 and row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)

                //row 1 right
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
                //row = 3
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));

                // row = 4
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty

                //separating +ve and -ve values.(2,1)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 2 right
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
                //combining the appropriate sign change
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)

                //separating +ve and -ve values.(3,2)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 2 -row 3
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
                //row 3 left
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)

                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //manipulation for row 3 -bottom
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 11);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                //separating +ve and -ve values.(3,bottom)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)


                //eliminating old left for row 0,1,2,3
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
                //packing row 2 and row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //row 3 right
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
                //loading row 3 right into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
                //adding bottom and top values of row 2 and row 3
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
                //separating +ve and -ve values.(bottom,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //to store right of row 2
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration

                //storing right of row 2 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
                //to store right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);

                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            ht_rem = ht & 0x2;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //manipulation for row 0 -row 1
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //manipulation for row 1 -bottom
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //manipulation for bottom - row 1 (row 1 right)
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //bottom - row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //eliminating old left for row 0,1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration signup0_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next

                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
                //for storing right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);

                src_top_16x8b = src_temp1_16x8b;
                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //the next top already in src_top_16x8b
                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            ht_rem = ht & 0x1;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));


                //manipulation for row 0 -bottom
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //for row 0 right to put into left store
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //left store manipulation 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
                //filling the left boundary value
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_left_cpy++;
                pu1_src_left_str++;
            }
            {   //for bottom right
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;

            pu1_left_tmp = pu1_src_left_cpy2;
            pu1_src_left_cpy2 = pu1_src_left_str2;
            pu1_src_left_str2 = pu1_left_tmp;

            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;

        }
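        /* The two corner samples computed earlier with scalar code (right
         * end of row 0, left end of the last row) are written back only
         * now, after the vector passes are complete. */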
        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
        pu1_src_left[0] = au1_src_left_tmp[0];
        for(row = 1; row < ht_tmp; row++)
        {
            pu1_src_left[row] = pu1_src_left_cpy[row];
        }
    }

}
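
/* A reference sketch (not used by the build) of what edge-offset class 3
 * computes for one interior chroma sample c of the U plane; the intrinsics
 * below evaluate the same expression for sixteen interleaved U/V bytes at a
 * time, with the scalar SIGN folded into saturating-subtract/compare tricks:
 *
 *     edge_idx = 2 + SIGN(c - top_right) + SIGN(c - bottom_left);
 *     edge_idx = gi1_table_edge_idx[edge_idx];
 *     if(0 != edge_idx)
 *         c = CLIP3(c + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
 *
 * where top_right is the same-plane sample one row up and one pixel right,
 * and bottom_left one row down and one pixel left. */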

void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
    UWORD8 *pu1_src_cpy, *pu1_src_org;
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    WORD32 wd_rem;
    UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
    WORD32 ht_tmp;
    WORD32 bit_depth;
    UWORD8 u1_avail0, u1_avail1;

    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i left_store_16x8b;
    __m128i const0_16x8b, const2_16x8b;
    __m128i chroma_offset_8x16b;

    ht_tmp = ht;
    au1_mask8x16b = _mm_set1_epi8(0xff);


    au1_src_left_tmp[0] = pu1_src[(wd - 2)];
    au1_src_left_tmp[1] = pu1_src[(wd - 1)];
    //copy the left boundary and append bottom-left (the 45-degree diagonal reads one row below)
    for(row = 2; row < 2 * ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
    au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];

    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
    //setting availability mask to 0xFF, size MAX_CTB_SIZE
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
    bit_depth = BIT_DEPTH_CHROMA;
    pu1_src_org = pu1_src;
    pu1_src_top_cpy = pu1_src_top;
    pu1_src_left_cpy2 = au1_src_left_tmp;
    pu1_src_left_cpy = au1_src_left_tmp;
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
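    /* chroma_offset_8x16b is the byte pattern {0x00, 0x08, 0x00, 0x08, ...}:
     * added to the edge indices it leaves the even (U) lanes untouched and
     * biases the odd (V) lanes by 8, so one pshufb can fetch U offsets from
     * the low half and V offsets from the high half of the combined offset
     * table assembled further below. */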
    /* If top-right is available, process separately */
    if(0 != pu1_avail[5])
    {
        WORD32 edge_idx;

        /* U */
        edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
                        SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
        }

        /* V */
        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
        }
    }
    else
    {
        u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
        u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
    }

    /* If bottom-left is available, process separately */
    if(0 != pu1_avail[6])
    {
        WORD32 edge_idx;

        /* U */
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
        }

        /* V */
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
                        SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
        }
    }
    else
    {
        u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
        u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
    }



    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        pu1_src_left_cpy2 += 2;
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();
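    /* sao_offset_8x16b now carries the U offsets in bytes 0-7 and the V
     * offsets in bytes 8-15; const0_16x8b was only borrowed above as a
     * temporary for the V offsets and reverts to all-zero here. */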


    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[1] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;
    au1_mask[wd - 2] = u1_avail1;
    {
        WORD32 ht_rem;
        au1_mask_cpy = au1_mask;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));

            //loading the mask
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            pu1_src_left_cpy = pu1_src_left_cpy2;
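            /* Two rows per iteration.  For this 45-degree class the
             * bottom-left neighbour of a row is the row below shifted right
             * by one U/V pair (spliced with palignr against the saved left
             * column), and the top-right neighbour is the row above shifted
             * left by one pair (an unaligned load at offset +2). */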

            for(row = ht; row >= 2; row -= 2)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //to insert left in row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
                // row = 0 right
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));

                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);

                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
                //combining sign_up and sign_down
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);

                //row1-row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);

                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 1 right
                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)

                //bottom - row1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration bottom -row1
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //to insert left in row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
                //manipulation for row 1 - bottom
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);

                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign_up and sign_down
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
                //row 1 right edge going into the left buffer for the next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
                //row 0 right edge going into the left buffer for the next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;


                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);

                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 4;
            }
            ht_rem = ht & 0x1;

            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //to insert left in row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
                //manipulation for row 0 - bottom
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);

                //current row -next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                //row 0 right edge going into the left buffer for the next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);

                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);


                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_left_cpy += 2;
            }
            {   //for bottom right
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //for the top left of next part of the block
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //updating the top boundary buffer
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
            au1_mask_cpy += 16;
        }
        pu1_src_left_cpy = pu1_src_left_cpy2;
        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the lower 8 bytes of the mask
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //preparing au1_mask
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
            pu1_src_left_cpy = pu1_src_left_cpy2;
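            /* Chroma width-remainder path: the same four-rows-per-iteration
             * scheme as the luma remainder path, except every shift moves by
             * 2 bytes (one U/V pair) instead of 1. */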
            for(row = ht; row >= 4; row -= 4)
            {
                left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
                //manipulation for row 0 -row 1
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
                //row 1 left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulating for row 1 - row 0
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1 -row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //manipulation for row 1 -row 2
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
                //row 2 left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
                //packing row 0 and row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)

                //row 1 right
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
                //row = 3
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));

                // row = 4
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty

                //separating +ve and -ve values.(2,1)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 2 right
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
                //combining the appropriate sign change
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)

                //separating +ve and -ve values.(3,2)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 2 -row 3
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
                //row 3 left
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)

                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //manipulation for row 3 -bottom
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 6);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                //separating +ve and -ve values.(3,bottom)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)


                //eliminating old left for row 0,1,2,3
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
                //packing row 2 and row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //row 3 right
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
                //loading row 3 right into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
                //adding bottom and top values of row 2 and row 3
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
                //separating +ve and -ve values.(bottom,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //to store right of row 2
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration

                //storing right of row 2 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
                //to store right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);


                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);

                //convert to 16 bit, then add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 8;
            }
            ht_rem = ht & 0x2;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //manipulation for row 0 -row 1
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //manipulation for row 1 - bottom
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row 1 - bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)

                //manipulation for bottom - row 1 (row 1 right)
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
                //adding the up and down sign differences
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //bottom - row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //eliminating old left for row 0,1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //sign_up for the next iteration
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
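                //the (2-1) sign computed above doubles as sign_up for the
                //(ht & 0x1) block, so the row-to-row comparison need not be
                //redone there.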

                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
                //for storing right of row 1
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);

                src_top_16x8b = src_temp1_16x8b;
                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
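                //left_store now holds the right-most U/V pairs of rows 0
                //and 1, taken before the offsets are applied; SAO edge
                //decisions for the neighbouring block use these unfiltered
                //samples as its left boundary.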

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
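                //two pshufb lookups do the per-pixel table reads:
                //edge_idx_8x16b remaps the raw 0..4 index to the HEVC edge
                //category and sao_offset_8x16b returns the offset;
                //chroma_offset_8x16b presumably biases the interleaved V
                //lanes so U and V index separate halves of the offset table.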
                //the next top is already in src_top_16x8b
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 4;
            }
            ht_rem = ht & 0x1;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));


                //manipulation for row 0 - bottom
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding the up and down sign differences
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //right of row 0, to be put into the left store
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
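                //shifting left then right by 8 bytes zeroes the upper half,
                //keeping edge indices only for the single remaining row.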
                //eliminating old left for row 0
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                //filling the left boundary value
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
                src_top_16x8b = src_temp0_16x8b;

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
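                //only the low 8 bytes are meaningful; the high half packs
                //zeros and the storel_epi64 below writes just 8 bytes.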

                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_left_cpy += 2;
            }
            {   //for bottom right
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
            }
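            //when the bottom boundary is unavailable the last row was left
            //unprocessed; its raw pixels, still in src_bottom_16x8b, are
            //what the block below must see as its top neighbours.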
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }

            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
        }
        pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
        pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
        for(row = 0; row < 2 * ht_tmp; row++)
        {
            pu1_src_left[row] = au1_src_left_tmp[row];
        }
    }

}
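
/* A minimal scalar sketch (not part of the original file) of the per-pixel
 * edge-offset decision the intrinsics above vectorize. The function name,
 * parameters and the remap table are illustrative assumptions in the style
 * of gi1_table_edge_idx; the real kernels additionally handle the U/V
 * interleave, availability masks and left/top boundary bookkeeping.
 */
#if 0 /* reference model only, kept out of the build */
static UWORD8 sao_edge_offset_pixel(UWORD8 u1_cur,    /* current sample   */
                                    UWORD8 u1_nbr_a,  /* first neighbour  */
                                    UWORD8 u1_nbr_b,  /* second neighbour */
                                    const WORD8 *pi1_sao_offset)
{
    /* raw index 0..4 from the two neighbour sign differences */
    WORD32 sign_a = (u1_cur > u1_nbr_a) - (u1_cur < u1_nbr_a);
    WORD32 sign_b = (u1_cur > u1_nbr_b) - (u1_cur < u1_nbr_b);
    /* remap as done by the edge_idx_8x16b shuffle (hypothetical copy) */
    static const WORD8 ai1_edge_idx[5] = { 1, 2, 0, 3, 4 };
    WORD32 edge_idx = ai1_edge_idx[2 + sign_a + sign_b];
    /* add the signed offset, then clip to 8 bit as _mm_packus_epi16 does */
    WORD32 i4_val = (WORD32)u1_cur + pi1_sao_offset[edge_idx];
    return (UWORD8)((i4_val < 0) ? 0 : ((i4_val > 255) ? 255 : i4_val));
}
#endif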