/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/

/**
******************************************************************************
* @file ime_distortion_metrics_sse42.c
*
* @brief
*  This file contains definitions of routines that compute distortion
*  between two macro/sub blocks of identical dimensions
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - ime_compute_sad_16x16_sse42()
*  - ime_compute_sad_16x16_fast_sse42()
*  - ime_compute_sad_16x16_ea8_sse42()
*  - ime_compute_sad_16x8_sse42()
*  - ime_calculate_sad4_prog_sse42()
*  - ime_sub_pel_compute_sad_16x16_sse42()
*  - ime_compute_satqd_16x16_lumainter_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* User include files */
#include "ime_typedefs.h"
#include "ime_defs.h"
#include "ime_macros.h"
#include "ime_statistics.h"
#include "ime_platform_macros.h"
#include "ime_distortion_metrics.h"
#include <immintrin.h>

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

/**
******************************************************************************
*
* @brief computes distortion (SAD) between two 16x16 blocks
*
* @par   Description
*   Accumulates the sum of absolute differences between the source and the
*   estimate block over 16 rows of 16 bytes each. The whole block is always
*   evaluated: i4_max_sad is accepted only to keep the common SAD function
*   signature and no early exit is performed in this variant.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] pu1_est
*  UWORD8 pointer to the estimate block
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer estimate stride
*
* @param[in] i4_max_sad
*  integer maximum allowed distortion (ignored by this function)
*
* @param[out] pi4_mb_distortion
*  integer evaluated sad of the complete 16x16 block
*
* @remarks
*  Rows are fetched with unaligned loads, so neither pointer has to be
*  16-byte aligned.
*
******************************************************************************
*/
void ime_compute_sad_16x16_sse42(UWORD8 *pu1_src,
                           UWORD8 *pu1_est,
                           WORD32 src_strd,
                           WORD32 est_strd,
                           WORD32 i4_max_sad,
                           WORD32 *pi4_mb_distortion)
{
    /* two 64-bit lanes, each collecting the SAD of one 8-byte half of a row */
    __m128i acc = _mm_setzero_si128();
    int lane_lo, lane_hi;
    int row;
    UNUSED (i4_max_sad);

    for (row = 0; row < 16; row++)
    {
        __m128i src_row = _mm_loadu_si128((__m128i *) (pu1_src + row * src_strd));
        __m128i est_row = _mm_loadu_si128((__m128i *) (pu1_est + row * est_strd));

        acc = _mm_add_epi64(acc, _mm_sad_epu8(src_row, est_row));
    }

    /* the low 32 bits of each 64-bit lane sit at 32-bit positions 0 and 2 */
    lane_lo = _mm_extract_epi32(acc, 0);
    lane_hi = _mm_extract_epi32(acc, 2);
    *pi4_mb_distortion = lane_lo + lane_hi;
}

/**
******************************************************************************
*
*  @brief computes distortion (SAD) between two 16x8 blocks
*
*
*  @par   Description
*   Accumulates the sum of absolute differences between the source and the
*   estimate block over 8 rows of 16 bytes each. The whole block is always
*   evaluated: i4_max_sad is accepted only to keep the common SAD function
*   signature and no early exit is performed in this variant.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] pu1_est
*  UWORD8 pointer to the estimate block
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer estimate stride
*
* @param[in] i4_max_sad
*  integer maximum allowed distortion (ignored by this function)
*
* @param[out] pi4_mb_distortion
*  integer evaluated sad of the complete 16x8 block
*
* @remarks
*  Rows are fetched with unaligned loads, so neither pointer has to be
*  16-byte aligned.
*
******************************************************************************
*/
void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src,
                    UWORD8 *pu1_est,
                    WORD32 src_strd,
                    WORD32 est_strd,
                    WORD32 i4_max_sad,
                    WORD32 *pi4_mb_distortion)
{
    /* two 64-bit lanes, each collecting the SAD of one 8-byte half of a row */
    __m128i acc = _mm_setzero_si128();
    int lane_lo, lane_hi;
    int row;
    UNUSED (i4_max_sad);

    for (row = 0; row < 8; row++)
    {
        __m128i src_row = _mm_loadu_si128((__m128i *) (pu1_src + row * src_strd));
        __m128i est_row = _mm_loadu_si128((__m128i *) (pu1_est + row * est_strd));

        acc = _mm_add_epi64(acc, _mm_sad_epu8(src_row, est_row));
    }

    /* the low 32 bits of each 64-bit lane sit at 32-bit positions 0 and 2 */
    lane_lo = _mm_extract_epi32(acc, 0);
    lane_hi = _mm_extract_epi32(acc, 2);
    *pi4_mb_distortion = lane_lo + lane_hi;
}

/**
******************************************************************************
*
* @brief computes distortion (SAD) between two 16x16 blocks with an early
*        exit after the first 8 rows
*
* @par   Description
*   The even rows (0, 2, ..., 14) of the block are evaluated first. If the
*   SAD of those 8 rows is strictly greater than i4_max_sad, that partial
*   value is written to pi4_mb_distortion and the function returns. Otherwise
*   the odd rows (1, 3, ..., 15) are added and the SAD of the complete block
*   is written. The partial SAD of 8 rows of 16 bytes cannot exceed
*   8 * 16 * 255, so any i4_max_sad at or above that (e.g. USHRT_MAX) forces
*   the complete block to be evaluated.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] pu1_est
*  UWORD8 pointer to the estimate block
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer estimate stride
*
* @param[in] i4_max_sad
*  integer maximum allowed distortion, compared against the even-row SAD
*
* @param[out] pi4_mb_distortion
*  integer evaluated sad (even rows only on early exit, all rows otherwise)
*
* @remarks
*  Rows are fetched with unaligned loads, so neither pointer has to be
*  16-byte aligned.
*
******************************************************************************
*/
void ime_compute_sad_16x16_ea8_sse42(UWORD8 *pu1_src,
                               UWORD8 *pu1_est,
                               WORD32 src_strd,
                               WORD32 est_strd,
                               WORD32 i4_max_sad,
                               WORD32 *pi4_mb_distortion)
{
    /* two 64-bit lanes, each collecting the SAD of one 8-byte half of a row */
    __m128i acc = _mm_setzero_si128();
    WORD32 lane_lo, lane_hi;
    WORD32 i4_partial_sad;
    WORD32 row;

    /* first pass: even rows 0, 2, ..., 14 */
    for (row = 0; row < 16; row += 2)
    {
        __m128i src_row = _mm_loadu_si128((__m128i *) (pu1_src + row * src_strd));
        __m128i est_row = _mm_loadu_si128((__m128i *) (pu1_est + row * est_strd));

        acc = _mm_add_epi64(acc, _mm_sad_epu8(src_row, est_row));
    }

    lane_lo = _mm_extract_epi32(acc, 0);
    lane_hi = _mm_extract_epi32(acc, 2);
    i4_partial_sad = lane_lo + lane_hi;

    /* half the block is already worse than the limit: report it and stop */
    if (i4_max_sad < i4_partial_sad)
    {
        *pi4_mb_distortion = i4_partial_sad;
        return;
    }

    /* second pass: odd rows 1, 3, ..., 15 are added to the same accumulator */
    for (row = 1; row < 16; row += 2)
    {
        __m128i src_row = _mm_loadu_si128((__m128i *) (pu1_src + row * src_strd));
        __m128i est_row = _mm_loadu_si128((__m128i *) (pu1_est + row * est_strd));

        acc = _mm_add_epi64(acc, _mm_sad_epu8(src_row, est_row));
    }

    lane_lo = _mm_extract_epi32(acc, 0);
    lane_hi = _mm_extract_epi32(acc, 2);
    *pi4_mb_distortion = lane_lo + lane_hi;
}

/**
******************************************************************************
*
* @brief computes distortion (SAD) between two 16x16 blocks (fast mode)
*
* @par   Description
*   Only the even rows (0, 2, ..., 14) of the block are evaluated. The SAD of
*   the whole block is assumed to be approximately twice the SAD of these
*   alternate rows, so the even-row SAD is doubled before it is returned.
*   i4_max_sad is accepted only to keep the common SAD function signature;
*   no early exit is performed.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] pu1_est
*  UWORD8 pointer to the estimate block
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer estimate stride
*
* @param[in] i4_max_sad
*  integer maximum allowed distortion (ignored by this function)
*
* @param[out] pi4_mb_distortion
*  integer approximate sad: two times the sad of the even rows
*
* @remarks
*  Rows are fetched with unaligned loads, so neither pointer has to be
*  16-byte aligned.
*
******************************************************************************
*/
void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src,
                                UWORD8 *pu1_est,
                                WORD32 src_strd,
                                WORD32 est_strd,
                                WORD32 i4_max_sad,
                                WORD32 *pi4_mb_distortion)
{
    /* two 64-bit lanes, each collecting the SAD of one 8-byte half of a row */
    __m128i sad_val = _mm_setzero_si128();
    WORD32 val1, val2;
    WORD32 i4_sad;
    WORD32 row;
    UNUSED (i4_max_sad);

    /* Row 0,2,...,14 sad calculation; odd rows are never read in fast mode */
    for (row = 0; row < 16; row += 2)
    {
        __m128i src_row = _mm_loadu_si128((__m128i *) (pu1_src + row * src_strd));
        __m128i est_row = _mm_loadu_si128((__m128i *) (pu1_est + row * est_strd));

        sad_val = _mm_add_epi64(sad_val, _mm_sad_epu8(src_row, est_row));
    }

    /* the low 32 bits of each 64-bit lane sit at 32-bit positions 0 and 2 */
    val1 = _mm_extract_epi32(sad_val, 0);
    val2 = _mm_extract_epi32(sad_val, 2);

    /* scale the half-block SAD up to an estimate for all 16 rows */
    i4_sad = val1 + val2;
    *pi4_mb_distortion = (i4_sad << 1);
    return;
}

/**
*******************************************************************************
*
* @brief compute sad at the four vertices of a unit diamond
*
* @par Description: This function computes the sad between the source block
* (16 rows of 16 bytes) and the four reference blocks located one position
* to the left, right, top and bottom of pu1_ref.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference (centre of the diamond)
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] ref_strd
*  integer reference stride
*
* @param[in] src_strd
*  integer source stride
*
* @param[out] pi4_sad
*  pointer to integer array of 4 evaluated sads
*  pi4_sad[0] - left   (pu1_ref - 1)
*  pi4_sad[1] - right  (pu1_ref + 1)
*  pi4_sad[2] - top    (pu1_ref - ref_strd)
*  pi4_sad[3] - bottom (pu1_ref + ref_strd)
*
* @remarks  The reference buffer is read one byte to the left and right of,
*  and one row above and below, the 16x16 area starting at pu1_ref.
*
*******************************************************************************
*/
void ime_calculate_sad4_prog_sse42(UWORD8 *pu1_ref,
                             UWORD8 *pu1_src,
                             WORD32 ref_strd,
                             WORD32 src_strd,
                             WORD32 *pi4_sad)
{
    /* one accumulator per diamond vertex, each holding two 64-bit partial sums */
    __m128i acc_left = _mm_setzero_si128();
    __m128i acc_right = _mm_setzero_si128();
    __m128i acc_top = _mm_setzero_si128();
    __m128i acc_bot = _mm_setzero_si128();
    WORD32 row;

    for (row = 0; row < 16; row++)
    {
        /* start of the current row in the centre reference block */
        UWORD8 *pu1_ref_row = pu1_ref + row * ref_strd;

        __m128i cur = _mm_loadu_si128((__m128i *) (pu1_src + row * src_strd));
        __m128i nbr_left = _mm_loadu_si128((__m128i *) (pu1_ref_row - 1));
        __m128i nbr_right = _mm_loadu_si128((__m128i *) (pu1_ref_row + 1));
        __m128i nbr_top = _mm_loadu_si128((__m128i *) (pu1_ref_row - ref_strd));
        __m128i nbr_bot = _mm_loadu_si128((__m128i *) (pu1_ref_row + ref_strd));

        acc_left = _mm_add_epi64(acc_left, _mm_sad_epu8(cur, nbr_left));
        acc_right = _mm_add_epi64(acc_right, _mm_sad_epu8(cur, nbr_right));
        acc_top = _mm_add_epi64(acc_top, _mm_sad_epu8(cur, nbr_top));
        acc_bot = _mm_add_epi64(acc_bot, _mm_sad_epu8(cur, nbr_bot));
    }

    /* fold the two lanes of every accumulator; 32-bit positions 0 and 2 hold */
    /* the low halves of the 64-bit partial sums                              */
    pi4_sad[0] = _mm_extract_epi32(acc_left, 0) + _mm_extract_epi32(acc_left, 2);
    pi4_sad[1] = _mm_extract_epi32(acc_right, 0) + _mm_extract_epi32(acc_right, 2);
    pi4_sad[2] = _mm_extract_epi32(acc_top, 0) + _mm_extract_epi32(acc_top, 2);
    pi4_sad[3] = _mm_extract_epi32(acc_bot, 0) + _mm_extract_epi32(acc_bot, 2);
}

/**
******************************************************************************
*
* @brief computes SAD of a 16x16 source block against eight sub-pel candidates
*
* @par Description
*   Computes the sum of absolute differences (SAD) between the 16x16 source
*   block and eight 16x16 candidate blocks. The candidates are read from the
*   three half pel buffers at the given positions and at the positions one
*   pixel to the left and/or one row (ref_strd) above, as listed under
*   pi4_sad.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_ref_half_x
*  UWORD8 pointer to half pel buffer (x); data starting at
*  pu1_ref_half_x - 1 is read as well
*
* @param[in] pu1_ref_half_y
*  UWORD8 pointer to half pel buffer (y); data starting at
*  pu1_ref_half_y - ref_strd is read as well
*
* @param[in] pu1_ref_half_xy
*  UWORD8 pointer to half pel buffer (xy); data starting at
*  pu1_ref_half_xy - ref_strd - 1 is read as well
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] ref_strd
*  integer ref stride, common to all three half pel buffers
*
* @param[out] pi4_sad
*  integer evaluated sad, eight entries are written
*  pi4_sad[0] - half x
*  pi4_sad[1] - half x - 1
*  pi4_sad[2] - half y
*  pi4_sad[3] - half y - strd
*  pi4_sad[4] - half xy
*  pi4_sad[5] - half xy - 1
*  pi4_sad[6] - half xy - strd
*  pi4_sad[7] - half xy - 1 - strd
*
* @remarks
*  Every row is fetched with unaligned 16 byte loads, so none of the
*  pointers needs to be 16 byte aligned.
*
******************************************************************************
*/
void ime_sub_pel_compute_sad_16x16_sse42(UWORD8 *pu1_src,
                                   UWORD8 *pu1_ref_half_x,
                                   UWORD8 *pu1_ref_half_y,
                                   UWORD8 *pu1_ref_half_xy,
                                   WORD32 src_strd,
                                   WORD32 ref_strd,
                                   WORD32 *pi4_sad)
{
    /* Current row of each candidate block, ordered like pi4_sad[] */
    UWORD8 *apu1_cand[8];
    /* Running SAD per candidate, one partial sum in each 64 bit lane */
    __m128i a_sad[8];
    __m128i src_row, cand_row;
    WORD32 row, idx;
    WORD32 lane_lo, lane_hi;

    apu1_cand[0] = pu1_ref_half_x;
    apu1_cand[1] = pu1_ref_half_x - 1;
    apu1_cand[2] = pu1_ref_half_y;
    apu1_cand[3] = pu1_ref_half_y - ref_strd;
    apu1_cand[4] = pu1_ref_half_xy;
    apu1_cand[5] = pu1_ref_half_xy - 1;
    apu1_cand[6] = pu1_ref_half_xy - ref_strd;
    apu1_cand[7] = pu1_ref_half_xy - ref_strd - 1;

    /* Row 0 seeds the accumulators */
    src_row = _mm_loadu_si128((__m128i *) (pu1_src));
    for(idx = 0; idx < 8; idx++)
    {
        cand_row = _mm_loadu_si128((__m128i *) (apu1_cand[idx]));
        a_sad[idx] = _mm_sad_epu8(src_row, cand_row);
    }

    /* Rows 1 to 15: step every pointer down one row, then accumulate */
    for(row = 1; row < 16; row++)
    {
        pu1_src += src_strd;
        src_row = _mm_loadu_si128((__m128i *) (pu1_src));

        for(idx = 0; idx < 8; idx++)
        {
            apu1_cand[idx] += ref_strd;
            cand_row = _mm_loadu_si128((__m128i *) (apu1_cand[idx]));
            a_sad[idx] = _mm_add_epi64(a_sad[idx],
                                       _mm_sad_epu8(src_row, cand_row));
        }
    }

    /* _mm_sad_epu8 leaves one partial sum per 64 bit lane; their low */
    /* 32 bit words are elements 0 and 2, which add up to the block SAD */
    for(idx = 0; idx < 8; idx++)
    {
        lane_lo = _mm_extract_epi32(a_sad[idx], 0);
        lane_hi = _mm_extract_epi32(a_sad[idx], 2);
        pi4_sad[idx] = (lane_lo + lane_hi);
    }
}
/**
******************************************************************************
*
* @brief This function computes SAD between two 16x16 blocks
*        It also computes if the block will be zero after H264 transform and quant for
*        Intra 16x16 blocks
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_est
*  UWORD8 pointer to the estimate block that is subtracted from the source
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer estimate stride
*
* @param[in] pu2_thrsh
*  Threshold for each element of transformed quantized block
*
* @param[out] pi4_mb_distortion
*  integer evaluated sad
*
* @param[out] pu4_is_zero
*  Pointer to store if the block is zero after transform and quantization
*
* @remarks
*
******************************************************************************
*/
void ime_compute_satqd_16x16_lumainter_sse42(UWORD8 *pu1_src,
                                         UWORD8 *pu1_est,
                                         WORD32 src_strd,
                                         WORD32 est_strd,
                                         UWORD16 *pu2_thrsh,
                                         WORD32 *pi4_mb_distortion,
                                         UWORD32 *pu4_is_zero)
{
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i est_r0, est_r1, est_r2, est_r3;
    __m128i temp0, temp1, temp2, temp3, temp4;
    __m128i zero = _mm_setzero_si128();          // all bits reset to zero
    __m128i all_one = _mm_set1_epi8(0xFF);
    __m128i sad_b1, sad_b2, threshold;
    WORD16 sad_1, sad_2;
    WORD32 i;
    UWORD32 flag = 0;
    WORD32 test1, test2;
    threshold = _mm_loadu_si128((__m128i *) pu2_thrsh);
    (*pi4_mb_distortion) = 0;

    for (i=0; i<4; i++)
    {
        src_r0 = _mm_loadl_epi64((__m128i *) pu1_src);  //Row 0 - Block1 and 2
        src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2
        src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2
        src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2

        src_r0 = _mm_cvtepu8_epi16(src_r0);
        src_r1 = _mm_cvtepu8_epi16(src_r1);
        src_r2 = _mm_cvtepu8_epi16(src_r2);
        src_r3 = _mm_cvtepu8_epi16(src_r3);

        est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
        est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
        est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
        est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));

        est_r0 = _mm_cvtepu8_epi16(est_r0);
        est_r1 = _mm_cvtepu8_epi16(est_r1);
        est_r2 = _mm_cvtepu8_epi16(est_r2);
        est_r3 = _mm_cvtepu8_epi16(est_r3);

        src_r0 = _mm_sub_epi16(src_r0, est_r0);
        src_r1 = _mm_sub_epi16(src_r1, est_r1);
        src_r2 = _mm_sub_epi16(src_r2, est_r2);
        src_r3 = _mm_sub_epi16(src_r3, est_r3);

        src_r0 = _mm_abs_epi16(src_r0);
        src_r1 = _mm_abs_epi16(src_r1);
        src_r2 = _mm_abs_epi16(src_r2);
        src_r3 = _mm_abs_epi16(src_r3);

        src_r0 = _mm_add_epi16(src_r0, src_r3);     //s1 s4 s4 s1 a1 a4 a4 a1
        src_r1 = _mm_add_epi16(src_r1, src_r2);     //s2 s3 s3 s2 a2 a3 a3 a2

        //SAD calculation
        temp0 = _mm_add_epi16(src_r0, src_r1);      //s1+s2 s4+s3 s4+s3 s1+s2 a1+a2 a4+a3 a4+a3 a1+a2
        temp0 = _mm_hadd_epi16(temp0, zero);
        temp0 = _mm_hadd_epi16(temp0, zero);        //sad1, sad2 - 16bit values

        sad_1 = _mm_extract_epi16(temp0, 0);
        sad_2 = _mm_extract_epi16(temp0, 1);

        (*pi4_mb_distortion) += sad_1 + sad_2;

        if (flag == 0) {
            sad_b1 = _mm_set1_epi16((sad_1 << 1));
            sad_b2 = _mm_set1_epi16((sad_2 << 1));

            src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1
            src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4

            src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2
            src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3

            src_r0 = _mm_hadd_epi16(src_r0, zero);      //s1 s4 a1 a4 0 0 0 0
            src_r1 = _mm_hadd_epi16(src_r1, zero);      //s2 s3 a2 a3 0 0 0 0

            temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
            temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0

            temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
            temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0

            temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
            temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0

            temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0

            temp0 = _mm_hadd_epi16(src_r0, zero);   //s1+s4 a1+a4 0 0 0 0 0 0
            temp1 = _mm_hadd_epi16(src_r1, zero);   //s2+s3 a2+a3 0 0 0 0 0 0

            temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0

            temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
            temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)

            temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
            temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1)

            sad_b1 = _mm_sub_epi16(sad_b1, temp2);      //lsi values Block0
            sad_b2 = _mm_sub_epi16(sad_b2, temp3);      //lsi values Block1

            temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff

            temp1 = _mm_cmpgt_epi16(threshold, sad_b2);

            temp0 = _mm_xor_si128(temp0, all_one);      //Xor with 1 => NOT operation
            temp1 = _mm_xor_si128(temp1, all_one);

            test1 = _mm_test_all_zeros(temp0, all_one);
            test2 = _mm_test_all_zeros(temp1, all_one);

            if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
                    || pu2_thrsh[8] <= sad_2)
                flag = 1;
        }

        pu1_src += 8;
        pu1_est += 8;

        src_r0 = _mm_loadl_epi64((__m128i *) pu1_src);  //Row 0 - Block1 and 2
        src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2
        src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2
        src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2

        src_r0 = _mm_cvtepu8_epi16(src_r0);
        src_r1 = _mm_cvtepu8_epi16(src_r1);
        src_r2 = _mm_cvtepu8_epi16(src_r2);
        src_r3 = _mm_cvtepu8_epi16(src_r3);

        est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
        est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
        est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
        est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));

        est_r0 = _mm_cvtepu8_epi16(est_r0);
        est_r1 = _mm_cvtepu8_epi16(est_r1);
        est_r2 = _mm_cvtepu8_epi16(est_r2);
        est_r3 = _mm_cvtepu8_epi16(est_r3);

        src_r0 = _mm_sub_epi16(src_r0, est_r0);
        src_r1 = _mm_sub_epi16(src_r1, est_r1);
        src_r2 = _mm_sub_epi16(src_r2, est_r2);
        src_r3 = _mm_sub_epi16(src_r3, est_r3);

        src_r0 = _mm_abs_epi16(src_r0);
        src_r1 = _mm_abs_epi16(src_r1);
        src_r2 = _mm_abs_epi16(src_r2);
        src_r3 = _mm_abs_epi16(src_r3);

        src_r0 = _mm_add_epi16(src_r0, src_r3);     //s1 s4 s4 s1 a1 a4 a4 a1
        src_r1 = _mm_add_epi16(src_r1, src_r2);     //s2 s3 s3 s2 a2 a3 a3 a2

        //SAD calculation
        temp0 = _mm_add_epi16(src_r0, src_r1);
        temp0 = _mm_hadd_epi16(temp0, zero);
        temp0 = _mm_hadd_epi16(temp0, zero);        //sad1, sad2 - 16bit values

        sad_1 = _mm_extract_epi16(temp0, 0);
        sad_2 = _mm_extract_epi16(temp0, 1);

        (*pi4_mb_distortion) += sad_1 + sad_2;

        if (flag == 0) {
            sad_b1 = _mm_set1_epi16((sad_1 << 1));
            sad_b2 = _mm_set1_epi16((sad_2 << 1));

            src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1
            src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4

            src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2
            src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3

            src_r0 = _mm_hadd_epi16(src_r0, zero);      //s1 s4 a1 a4 0 0 0 0
            src_r1 = _mm_hadd_epi16(src_r1, zero);      //s2 s3 a2 a3 0 0 0 0

            temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
            temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0

            temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
            temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0

            temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
            temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0

            temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0

            temp0 = _mm_hadd_epi16(src_r0, zero);   //s1+s4 a1+a4 0 0 0 0 0 0
            temp1 = _mm_hadd_epi16(src_r1, zero);   //s2+s3 a2+a3 0 0 0 0 0 0

            temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0

            temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
            temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)

            temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
            temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1)

            sad_b1 = _mm_sub_epi16(sad_b1, temp2);      //lsi values Block0
            sad_b2 = _mm_sub_epi16(sad_b2, temp3);      //lsi values Block1

            temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff

            temp1 = _mm_cmpgt_epi16(threshold, sad_b2);

            temp0 = _mm_xor_si128(temp0, all_one);      //Xor with 1 => NOT operation
            temp1 = _mm_xor_si128(temp1, all_one);

            test1 = _mm_test_all_zeros(temp0, all_one);
            test2 = _mm_test_all_zeros(temp1, all_one);

            if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
                    || pu2_thrsh[8] <= sad_2)
                flag = 1;
        }

        pu1_src += 4*src_strd - 8;
        pu1_est += 4*est_strd - 8;
    }

        *pu4_is_zero = flag;
}