//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

//******************************************************************************
//*
//* @brief
//*  This file contains definitions of routines for combing artifact check (CAC)
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*  - ideint_cac_8x8_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************


//******************************************************************************
//*
//*  @brief Calculates Combing Artifact
//*
//*  @par   Description
//*   This function calculates the combing artifact check (CAC) for the two given fields
//*
//* @param[in] pu1_top
//*  UWORD8 pointer to top field
//*
//* @param[in] pu1_bot
//*  UWORD8 pointer to bottom field
//*
//* @param[in] top_strd
//*  Top field stride
//*
//* @param[in] bot_strd
//*  Bottom field stride
//*
//* @returns
//*     1 if, for either 4-column sub-block, the adj value exceeds the
//*     biased alt value; 0 otherwise
//*
//* @remarks
//*
//******************************************************************************

    .global ideint_cac_8x8_av8

ideint_cac_8x8_av8:
    //--------------------------------------------------------------------------
    // ABI:   AAPCS64, leaf routine, no stack usage
    // In:    x0 = pu1_top, x1 = pu1_bot, w2 = top_strd, w3 = bot_strd
    // Out:   x0 = 1 if adj > biased alt for either 4-column sub-block, else 0
    // Clobb: x0-x3, v0-v4, v6, v16-v22, v24-v31, flags
    //
    // Four rows of eight bytes are read from each field.
    //--------------------------------------------------------------------------

    // The strides are used below as 64-bit post-index increments.
    // NOTE(review): this assumes the C prototype declares both strides as
    // 32-bit signed ints - confirm against the header. AAPCS64 leaves bits
    // 63:32 of such arguments unspecified, so widen them explicitly.
    sxtw    x2,             w2
    sxtw    x3,             w3

    // Load first row of top
    ld1     {v28.8b},       [x0],       x2

    // Load first row of bottom
    ld1     {v29.8b},       [x1],       x3
    mov     v28.d[1],       v29.d[0]

    // Load second row of top
    ld1     {v30.8b},       [x0],       x2

    // Load second row of bottom
    ld1     {v31.8b},       [x1],       x3
    mov     v30.d[1],       v31.d[0]

    // v28 = {top row 1, bottom row 1}, v30 = {top row 2, bottom row 2}


    // Calculate row based adj and alt values
    // Get row sums (two pairwise widening adds => sum of 4 adjacent pixels)
    uaddlp  v0.8h,          v28.16b

    uaddlp  v2.8h,          v30.16b

    uaddlp  v0.4s,          v0.8h

    uaddlp  v2.4s,          v2.8h

    // v0 (row 1) and v2 (row 2) each hold four 32 bit sums:
    // {top left, top right, bottom left, bottom right} 4-pixel sums
    // Pack v0 and v2 into a single register (a sum of four bytes is at most
    // 4 * 255 = 1020, so it does not exceed 16 bits)

    shl     v16.4s,         v2.4s,      #16
    orr     v16.16b,        v0.16b,     v16.16b
    // v16 now contains 8 sums (row 1 in even, row 2 in odd 16 bit lanes)

    // Load third row of top
    ld1     {v24.8b},       [x0],       x2

    // Load third row of bottom
    ld1     {v25.8b},       [x1],       x3
    mov     v24.d[1],       v25.d[0]

    // Load fourth row of top
    ld1     {v26.8b},       [x0],       x2

    // Load fourth row of bottom
    ld1     {v27.8b},       [x1],       x3
    mov     v26.d[1],       v27.d[0]

    // Get row sums
    uaddlp  v4.8h,          v24.16b

    uaddlp  v6.8h,          v26.16b

    uaddlp  v4.4s,          v4.8h

    uaddlp  v6.4s,          v6.8h
    // v4 (row 3) and v6 (row 4) hold the same four sums for rows 3 and 4
    // Pack v4 and v6 into a single register (sum does not exceed 16bits)

    shl     v18.4s,         v6.4s,      #16
    orr     v18.16b,        v4.16b,     v18.16b
    // v18 now contains 8 sums (row 3 in even, row 4 in odd 16 bit lanes)

    // Compute absolute diff between top and bottom row sums
    // (top sums sit in the low 64 bits, bottom sums in the high 64 bits)
    mov     v17.d[0],       v16.d[1]
    uabd    v16.4h,         v16.4h,     v17.4h

    mov     v19.d[0],       v18.d[1]
    uabd    v17.4h,         v18.4h,     v19.4h

    mov     v16.d[1],       v17.d[0]

    // RSUM_CSUM_THRESH
    movi    v18.8h,         #20

    // Eliminate values smaller than RSUM_CSUM_THRESH
    cmhs    v20.8h,         v16.8h,     v18.8h
    and     v20.16b,        v16.16b,    v20.16b

    // v20 now contains 8 absolute diff of sums above the threshold

    // Compute adj (rows 1/2 diffs + rows 3/4 diffs)
    mov     v21.d[0],       v20.d[1]
    add     v20.4h,         v20.4h,     v21.4h

    // v20 has four adj values for two sub-blocks
    // (lanes 0,1 = left sub-block, lanes 2,3 = right sub-block)

    // Compute alt: |row 1 - row 2| + |row 3 - row 4| within each field
    uabd    v0.4s,      v0.4s,      v2.4s
    uabd    v4.4s,      v4.4s,      v6.4s

    add     v0.4s,      v0.4s,      v4.4s

    // Fold the bottom field contribution onto the top field one.
    // Only the low two lanes of v21 are consumed afterwards, so the
    // unwritten upper half of v1 is harmless.
    mov     v1.d[0],    v0.d[1]
    add     v21.4s,     v0.4s,      v1.4s
    // d21 has two values for two sub-blocks


    // Calculate column based adj and alt values

    // Rounded average of the four rows of each field, per column
    urhadd  v0.16b,     v28.16b,    v30.16b
    urhadd  v2.16b,     v24.16b,    v26.16b
    urhadd  v0.16b,     v0.16b,     v2.16b

    // |top average - bottom average| per column (upper 64 bits become zero)
    mov     v1.d[0],    v0.d[1]
    uabd    v0.8b,      v0.8b,      v1.8b

    // RSUM_CSUM_THRESH >> 2
    movi    v22.16b,        #5

    // Eliminate values smaller than RSUM_CSUM_THRESH >> 2
    cmhs    v1.16b,      v0.16b,        v22.16b
    and     v0.16b,      v0.16b,        v1.16b
    // d0 now contains 8 absolute diff of sums above the threshold


    // Sum column pairs and scale by 4 (values are averages over 4 rows)
    uaddlp  v0.4h,      v0.8b
    shl     v0.4h,      v0.4h,#2

    // Add row based adj
    add     v20.4h,     v0.4h,      v20.4h

    uaddlp  v20.2s,     v20.4h
    // d20 now contains 2 adj values


    // Column based alt: average of rows 1 and 3 of both fields ...
    urhadd  v0.8b,      v28.8b,     v29.8b
    urhadd  v2.8b,      v24.8b,     v25.8b
    urhadd  v0.8b,      v0.8b,      v2.8b

    // ... against the average of rows 2 and 4 of both fields
    urhadd  v1.8b,      v30.8b,     v31.8b
    urhadd  v3.8b,      v26.8b,     v27.8b
    urhadd  v1.8b,      v1.8b,      v3.8b

    uabd    v0.8b,      v0.8b,      v1.8b
    uaddlp  v0.4h,      v0.8b

    shl     v0.4h,      v0.4h,      #2
    uaddlp  v0.2s,      v0.4h
    add     v21.2s,     v0.2s,      v21.2s


    // d21 now contains 2 alt values

    // SAD_BIAS_MULT_SHIFT: alt += alt >> 3
    ushr    v0.2s,      v21.2s,     #3
    add     v21.2s,     v21.2s,     v0.2s

    // SAD_BIAS_ADDITIVE >> 1
    movi    v0.2s,      #4
    add     v21.2s,     v21.2s,     v0.2s

    // Per sub-block mask: all ones where adj > alt, else zero
    cmhi    v0.2s,      v20.2s,     v21.2s
    uaddlp  v0.1d,      v0.2s

    // The low 32 bits of the mask sum are non-zero iff at least one
    // sub-block mask was set; collapse that to 0 / 1
    smov    x0,         v0.s[0]
    cmp     x0,         #0
    cset    x0,         ne
    ret