/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ideint_cac_ssse3.c
*
* @brief
*  This file includes the definitions of the combing artifact check function
* of the de-interlacer and some variants of it.
*
* @author
*  Ittiam
*
* @par List of Functions:
*  ideint_cac_8x8_ssse3()
*
* @remarks
*  In the de-interlacer workspace, cac is not a separate assembly module as
* it comes along with the de_int_decision() function. But in the C-Model, to
* keep things cleaner, it was made a separate function during
* cac experiments long after the assembly was written by Mudit.
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <immintrin.h>

/* User include files */
#include "icv_datatypes.h"
#include "icv_macros.h"
#include "icv.h"
#include "icv_variance.h"
#include "icv_sad.h"
#include "ideint.h"
#include "ideint_defs.h"
#include "ideint_structs.h"
#include "ideint_cac.h"

/**
*******************************************************************************
*
* @brief
*  Combing artifact check for an 8x8 block (x86 SIMD intrinsics version)
*
* @par   Description
*  Reads four rows of eight pixels from the top field and four rows of eight
*  pixels from the bottom field. The block is evaluated as two halves
*  (columns 0-3 and columns 4-7). For each half two metrics are accumulated:
*   - adj : differences between the top field and the bottom field
*           (row sums of the same row index, and averaged columns); a term
*           only contributes when it reaches the threshold derived from
*           RSUM_CSUM_THRESH
*   - alt : differences between consecutive rows of the same field
*           (row sums), and between the average of rows 0 and 2 and the
*           average of rows 1 and 3 of both fields (columns)
*  alt is then biased using SAD_BIAS_MULT_SHIFT and SAD_BIAS_ADDITIVE, and
*  the artifact is flagged when alt is smaller than adj in either half.
*
* @param[in] pu1_top
*  Pointer to the first of the four 8-pixel rows read from the top field
*
* @param[in] pu1_bot
*  Pointer to the first of the four 8-pixel rows read from the bottom field
*
* @param[in] top_strd
*  Offset in bytes between consecutive rows of the top field
*
* @param[in] bot_strd
*  Offset in bytes between consecutive rows of the bottom field
*
* @returns
*  1 if the biased alt metric is less than the adj metric for at least one of
*  the two halves, 0 otherwise
*
* @remarks
*  Pixels are widened to 16-bit lanes with a zero upper byte, so the byte-wise
*  intrinsics (_mm_sad_epu8, _mm_avg_epu8, _mm_min_epu8, _mm_max_epu8) operate
*  on the pixel values and leave the upper bytes at zero.
*
*******************************************************************************
*/
WORD32 ideint_cac_8x8_ssse3(UWORD8 *pu1_top,
                            UWORD8 *pu1_bot,
                            WORD32 top_strd,
                            WORD32 bot_strd)
{
    const __m128i zero = _mm_setzero_si128();

    __m128i top[4];             /* top field rows, one pixel per 16-bit lane */
    __m128i bot[4];             /* bottom field rows, same layout            */
    WORD32 rsum_top[4][2];      /* [row][half] pixel sums of the top field   */
    WORD32 rsum_bot[4][2];      /* [row][half] pixel sums of the bottom field*/
    WORD32 adj[2] = {0, 0};     /* per-half top-vs-bottom metric             */
    WORD32 alt[2] = {0, 0};     /* per-half row-to-row metric                */
    WORD32 row, half;
    WORD32 ca;

    /* Load four rows from each field, widen them and take per-half row sums */
    for(row = 0; row < 4; row++)
    {
        __m128i sad_t, sad_b;

        top[row] = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pu1_top), zero);
        bot[row] = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pu1_bot), zero);
        pu1_top += top_strd;
        pu1_bot += bot_strd;

        /* SAD against zero adds up the bytes of each 64-bit half, i.e.      */
        /* pixels 0-3 in the low half and pixels 4-7 in the high half        */
        sad_t = _mm_sad_epu8(top[row], zero);
        sad_b = _mm_sad_epu8(bot[row], zero);

        rsum_top[row][0] = _mm_cvtsi128_si32(sad_t);
        rsum_top[row][1] = _mm_cvtsi128_si32(_mm_srli_si128(sad_t, 8));
        rsum_bot[row][0] = _mm_cvtsi128_si32(sad_b);
        rsum_bot[row][1] = _mm_cvtsi128_si32(_mm_srli_si128(sad_b, 8));
    }

    /* Row based adj and alt, handled one pair of rows at a time */
    for(half = 0; half < 2; half++)
    {
        for(row = 0; row < 4; row += 2)
        {
            WORD32 t0 = rsum_top[row][half];
            WORD32 b0 = rsum_bot[row][half];
            WORD32 t1 = rsum_top[row + 1][half];
            WORD32 b1 = rsum_bot[row + 1][half];
            WORD32 field_diff;

            /* Top vs bottom for each row; small differences are ignored */
            field_diff = ABS_DIF(t0, b0);
            if(field_diff >= RSUM_CSUM_THRESH)
            {
                adj[half] += field_diff;
            }

            field_diff = ABS_DIF(t1, b1);
            if(field_diff >= RSUM_CSUM_THRESH)
            {
                adj[half] += field_diff;
            }

            /* Consecutive rows of the same field */
            alt[half] += ABS_DIF(t0, t1);
            alt[half] += ABS_DIF(b0, b1);
        }
    }

    /* Column based adj: averaged top rows against averaged bottom rows */
    {
        const __m128i thresh = _mm_set1_epi16((RSUM_CSUM_THRESH >> 2) - 1);
        __m128i top_avg, bot_avg, col_diff;
        WORD32 col_sum;

        top_avg = _mm_avg_epu8(_mm_avg_epu8(top[0], top[1]),
                               _mm_avg_epu8(top[2], top[3]));
        bot_avg = _mm_avg_epu8(_mm_avg_epu8(bot[0], bot[1]),
                               _mm_avg_epu8(bot[2], bot[3]));

        /* max - min gives the absolute difference of every column */
        col_diff = _mm_sub_epi16(_mm_max_epu8(top_avg, bot_avg),
                                 _mm_min_epu8(top_avg, bot_avg));

        /* Keep only the columns whose difference exceeds the threshold */
        col_diff = _mm_and_si128(col_diff, _mm_cmpgt_epi16(col_diff, thresh));

        col_sum  = _mm_extract_epi16(col_diff, 0);
        col_sum += _mm_extract_epi16(col_diff, 1);
        col_sum += _mm_extract_epi16(col_diff, 2);
        col_sum += _mm_extract_epi16(col_diff, 3);
        adj[0]  += col_sum << 2;

        col_sum  = _mm_extract_epi16(col_diff, 4);
        col_sum += _mm_extract_epi16(col_diff, 5);
        col_sum += _mm_extract_epi16(col_diff, 6);
        col_sum += _mm_extract_epi16(col_diff, 7);
        adj[1]  += col_sum << 2;
    }

    /* Column based alt: average of rows 0 and 2 against rows 1 and 3 */
    {
        __m128i even_avg, odd_avg, col_sad;

        even_avg = _mm_avg_epu8(_mm_avg_epu8(top[0], bot[0]),
                                _mm_avg_epu8(top[2], bot[2]));
        odd_avg  = _mm_avg_epu8(_mm_avg_epu8(top[1], bot[1]),
                                _mm_avg_epu8(top[3], bot[3]));

        /* One SAD result per 64-bit half, i.e. per group of four columns */
        col_sad = _mm_sad_epu8(even_avg, odd_avg);

        alt[0] += _mm_cvtsi128_si32(col_sad) << 2;
        alt[1] += _mm_cvtsi128_si32(_mm_srli_si128(col_sad, 8)) << 2;
    }

    /* Bias alt, then flag the block if either half has alt below adj */
    ca = 0;
    for(half = 0; half < 2; half++)
    {
        alt[half] += (alt[half] >> SAD_BIAS_MULT_SHIFT) + (SAD_BIAS_ADDITIVE >> 1);
        ca |= (alt[half] < adj[half]);
    }

    return ca;
}