@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

@******************************************************************************
@*
@* @brief
@*  This file contains definitions of routines for combing artifact check
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*  - ideint_cac_8x8_a9()
@*
@* @remarks
@*  None
@*
@*******************************************************************************


@******************************************************************************
@*
@*  @brief Calculates Combing Artifact
@*
@*  @par   Description
@*   This function calculates the combing artifact check (CAC) for the two given fields
@*
@* @param[in] pu1_top
@*  UWORD8 pointer to top field
@*
@* @param[in] pu1_bot
@*  UWORD8 pointer to bottom field
@*
@* @param[in] top_strd
@*  Top field stride
@*
@* @param[in] bot_strd
@*  Bottom field stride
@*
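@* @returns
@*  1 if a combing artifact is detected in either 4-column half of the block,
@*  0 otherwise (result is returned in r0)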
@*
@* @remarks
@*
@******************************************************************************

    .global ideint_cac_8x8_a9

@------------------------------------------------------------------------------
@ ideint_cac_8x8_a9
@
@ In:    r0 = pu1_top, r1 = pu1_bot, r2 = top_strd, r3 = bot_strd
@ Out:   r0 = 1 if (alt < adj) for the left (columns 0-3) or the right
@             (columns 4-7) half of the block, 0 otherwise
@ Reads: four rows of 8 bytes from each field
@ Uses:  r0, r1, flags, q0-q3, q8-q10, q12-q15
@        None of these are callee-saved under the AAPCS, so the routine
@        needs neither a stack frame nor a NEON save area.
@------------------------------------------------------------------------------
ideint_cac_8x8_a9:

    @ Leaf routine: no core callee-saved register is used, so nothing is pushed.

    @ Load first row of top
    vld1.u8     d28,    [r0],   r2

    @ Load first row of bottom
    vld1.u8     d29,    [r1],   r3

    @ Load second row of top
    vld1.u8     d30,    [r0],   r2

    @ Load second row of bottom
    vld1.u8     d31,    [r1],   r3


    @ Calculate row based adj and alt values
    @ Get row sums: two pairwise widening adds turn each 8-byte row into
    @ two 32 bit sums (columns 0-3 and columns 4-7)
    vpaddl.u8   q0,     q14

    vpaddl.u8   q1,     q15

    vpaddl.u16  q0,     q0

    vpaddl.u16  q1,     q1

    @ q0 = {top0 L, top0 R, bot0 L, bot0 R}, q1 = same layout for row 1
    @ Pack q0 and q1 into a single register (sum does not exceed 16bits)

    vshl.u32    q8,     q1,     #16
    vorr.u32    q8,     q0,     q8
    @ q8 now contains 8 sums: d16 = top rows 0/1, d17 = bottom rows 0/1

    @ Load third row of top
    vld1.u8     d24,    [r0],   r2

    @ Load third row of bottom
    vld1.u8     d25,    [r1],   r3

    @ Load fourth row of top
    vld1.u8     d26,    [r0],   r2

    @ Load fourth row of bottom
    vld1.u8     d27,    [r1],   r3

    @ Get row sums
    vpaddl.u8   q2,     q12

    vpaddl.u8   q3,     q13

    vpaddl.u16  q2,     q2

    vpaddl.u16  q3,     q3
    @ q2 = {top2 L, top2 R, bot2 L, bot2 R}, q3 = same layout for row 3
    @ Pack q2 and q3 into a single register (sum does not exceed 16bits)

    vshl.u32    q9,     q3,     #16
    vorr.u32    q9,     q2,     q9
    @ q9 now contains 8 sums: d18 = top rows 2/3, d19 = bottom rows 2/3

    @ Compute absolute diff between top and bottom row sums
    vabd.u16    d16,    d16,    d17
    vabd.u16    d17,    d18,    d19

    @ RSUM_CSUM_THRESH
    vmov.u16    q9,     #20

    @ Eliminate values smaller than RSUM_CSUM_THRESH
    vcge.u16    q10,    q8,     q9
    vand.u16    q10,    q8,     q10
    @ q10 now contains 8 absolute diff of sums above the threshold


    @ Compute adj
    vadd.u16    d20,    d20,    d21

    @ d20 has four adj values for two sub-blocks (lanes 0-1 left, 2-3 right)

    @ Compute alt: |row0 - row1| + |row2 - row3| within each field,
    @ then add the top-field and bottom-field halves together
    vabd.u32    q0,     q0,     q1
    vabd.u32    q2,     q2,     q3

    vadd.u32    q0,     q0,     q2
    vadd.u32    d21,    d0,     d1
    @ d21 has two values for two sub-blocks


    @ Calculate column based adj and alt values

    @ Rounding average of the four rows of each field:
    @ d0 = per-column average of top, d1 = per-column average of bottom
    vrhadd.u8   q0,     q14,    q15
    vrhadd.u8   q1,     q12,    q13
    vrhadd.u8   q0,     q0,     q1

    vabd.u8     d0,     d0,     d1

    @ RSUM_CSUM_THRESH >> 2
    @ Held in d2: q1 is dead after the averaging above and d2 is only
    @ rewritten further below. d8-d15 are callee-saved and must not be
    @ used here without being saved first.
    vmov.u8     d2,     #5

    @ Eliminate values smaller than RSUM_CSUM_THRESH >> 2
    vcge.u8     d1,     d0,     d2
    vand.u8     d0,     d0,     d1
    @ d0 now contains 8 absolute diff of sums above the threshold


    @ Pair up columns and scale by 4 to undo the averaging
    vpaddl.u8   d0,     d0
    vshl.u16    d0,     d0,     #2

    @ Add row based adj
    vadd.u16    d20,    d0,     d20

    vpaddl.u16  d20,    d20
    @ d20 now contains 2 adj values


    @ d0 = per-column average of rows 0 and 2 (both fields),
    @ d1 = per-column average of rows 1 and 3 (both fields)
    vrhadd.u8   d0,     d28,    d29
    vrhadd.u8   d2,     d24,    d25
    vrhadd.u8   d0,     d0,     d2

    vrhadd.u8   d1,     d30,    d31
    vrhadd.u8   d3,     d26,    d27
    vrhadd.u8   d1,     d1,     d3

    vabd.u8     d0,     d0,     d1
    vpaddl.u8   d0,     d0

    vshl.u16    d0,     d0,     #2
    vpaddl.u16  d0,     d0
    vadd.u32    d21,    d0,     d21


    @ d21 now contains 2 alt values

    @ SAD_BIAS_MULT_SHIFT: alt += alt >> 3
    vshr.u32    d0,     d21,    #3
    vadd.u32    d21,    d21,    d0

    @ SAD_BIAS_ADDITIVE >> 1
    vmov.u32    d0,     #4
    vadd.u32    d21,    d21,    d0

    @ Per sub-block mask: all ones where alt < adj
    vclt.u32    d0,     d21,    d20
    @ Sum both masks; the low word is non-zero if either mask is set
    vpaddl.u32  d0,     d0

    @ Collapse to 0 / 1
    vmov.u32    r0,     d0[0]
    cmp         r0,     #0
    movne       r0,     #1
    bx          lr