//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
//**

///**
//******************************************************************************
//*
//*
//* @brief
//*  This file contains definitions of routines that compute distortion
//*  between two macro/sub blocks of identical dimensions
//*
//* @author
//*  Ittiam
//*
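//* @par List of Functions:
//*  - ime_compute_sad_16x16_fast_av8()
//*  - ime_compute_sad_16x8_av8()
//*  - ime_compute_sad_16x16_ea8_av8()
//*  - ime_calculate_sad2_prog_av8()
//*  - ime_calculate_sad3_prog_av8()
//*  - ime_sub_pel_compute_sad_16x16_av8()
//*  - ime_compute_sad_16x16_av8()
//*  - ime_calculate_sad4_prog_av8()
//*  - ime_compute_satqd_16x16_lumainter_av8()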
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//


///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
//*
//* @par   Description
//*   This function estimates the SAD between two 16x16 blocks from every
//*   second row: 8 rows are compared and the accumulated sum is doubled.
//*   i4_max_sad is not consulted, so this variant has no early exit.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] i4_max_sad
//*  integer maximum allowed distortion
//*
//* @param[in] pi4_mb_distortion
//*  integer evaluated sad
//*
//* @remarks
//*
//******************************************************************************
//*/
.text
.p2align 2

// Spill the callee-saved SIMD registers d8-d15 (the low 64 bits of v8-v15)
// into one 64-byte stack frame. d14/d15 sit at the new sp, d8/d9 at sp+48.
.macro push_v_regs
    stp       d14, d15, [sp, #-64]!
    stp       d12, d13, [sp, #16]
    stp       d10, d11, [sp, #32]
    stp       d8, d9, [sp, #48]
.endm
// Reload d8-d15 from the frame built by push_v_regs and release the frame.
.macro pop_v_regs
    ldp       d8, d9, [sp, #48]
    ldp       d10, d11, [sp, #32]
    ldp       d12, d13, [sp, #16]
    ldp       d14, d15, [sp], #64
.endm

    .global ime_compute_sad_16x16_fast_av8
ime_compute_sad_16x16_fast_av8:
    // Sub-sampled 16x16 SAD: both strides are doubled so only 8 alternate
    // rows are compared, and the accumulated sum is doubled before the store.
    //   x0 = pu1_src            x1 = pu1_dst (block compared against)
    //   w2 = src_strd           w3 = dst_strd
    //   w4 = i4_max_sad (not read here)
    //   x5 = pi4_mb_distortion (one 32-bit word is written)
    push_v_regs
    sbfiz     x2, x2, #1, #32           // x2 = 2 * sign-extended src_strd
    sbfiz     x3, x3, #1, #32           // x3 = 2 * sign-extended dst_strd
    movi      v30.8h, #0                // eight 16-bit partial sums
    mov       w6, #2                    // 2 passes x 4 sampled rows

.Lsad_16x16_fast_four_rows:
    // fetch four sampled rows from each block
    ld1       {v0.16b}, [x0], x2
    ld1       {v2.16b}, [x0], x2
    ld1       {v4.16b}, [x0], x2
    ld1       {v6.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v3.16b}, [x1], x3
    ld1       {v5.16b}, [x1], x3
    ld1       {v7.16b}, [x1], x3

    // accumulate |src - dst| for the low and high 8 bytes of every row
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b
    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      w6, w6, #1
    b.ne      .Lsad_16x16_fast_four_rows

    // horizontal sum (each lane holds at most 16 * 255, so nothing wraps)
    uaddlv    s30, v30.8h
    shl       v30.2s, v30.2s, #1        // scale the 8-row sum by two

    str       s30, [x5]
    pop_v_regs
    ret


///**
//******************************************************************************
//*
//*  @brief computes distortion (SAD) between 2 16x8 blocks
//*
//*  @par   Description
//*   Accumulates the absolute differences of 8 rows of 16 bytes each and
//*   writes the 32-bit total to pi4_mb_distortion. u4_max_sad is not read by
//*   this routine, so the complete 16x8 SAD is always computed.
//*
//* @param[in] pu1_src (x0)
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst (x1)
//*  UWORD8 pointer to the block the source is compared against
//*
//* @param[in] src_strd (w2)
//*  integer source stride
//*
//* @param[in] dst_strd (w3)
//*  integer destination stride
//*
//* @param[in] u4_max_sad (w4)
//*  integer maximum allowed distortion (unused)
//*
//* @param[out] pi4_mb_distortion (x5)
//*  integer evaluated sad
//*
//******************************************************************************
//*/
    .global ime_compute_sad_16x8_av8
ime_compute_sad_16x8_av8:
    push_v_regs
    sxtw      x2, w2                    // strides arrive as 32-bit ints
    sxtw      x3, w3
    movi      v30.8h, #0                // eight 16-bit partial sums
    mov       w6, #2                    // 2 passes x 4 rows = 8 rows

.Lsad_16x8_four_rows:
    // fetch four consecutive rows from each block
    ld1       {v0.16b}, [x0], x2
    ld1       {v2.16b}, [x0], x2
    ld1       {v4.16b}, [x0], x2
    ld1       {v6.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v3.16b}, [x1], x3
    ld1       {v5.16b}, [x1], x3
    ld1       {v7.16b}, [x1], x3

    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b
    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      w6, w6, #1
    b.ne      .Lsad_16x8_four_rows

    // horizontal sum (each lane holds at most 16 * 255, so nothing wraps)
    uaddlv    s30, v30.8h

    str       s30, [x5]
    pop_v_regs
    ret

///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
//*
//* @par   Description
//*   The SAD of the 8 even rows is computed first. If that partial sum is
//*   greater than i4_max_sad (signed compare) it is written out as the result
//*   and the routine returns. Otherwise the 8 odd rows are added and the
//*   full 16x16 SAD is written.
//*
//* @param[in] pu1_src (x0)
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst (x1)
//*  UWORD8 pointer to the block the source is compared against
//*
//* @param[in] src_strd (w2)
//*  integer source stride
//*
//* @param[in] dst_strd (w3)
//*  integer destination stride
//*
//* @param[in] i4_max_sad (w4)
//*  integer maximum allowed distortion
//*
//* @param[out] pi4_mb_distortion (x5)
//*  integer evaluated sad (partial on early exit)
//*
//******************************************************************************
//*/

    .global ime_compute_sad_16x16_ea8_av8
ime_compute_sad_16x16_ea8_av8:

    push_v_regs

    // x7/x8 address row 1 of each block; x0/x1 keep addressing row 0
    add       x7, x0, w2, sxtw
    add       x8, x1, w3, sxtw

    // from here on both strides step two rows at a time
    sbfiz     x2, x2, #1, #32
    sbfiz     x3, x3, #1, #32

    movi      v30.8h, #0                // eight 16-bit partial sums

    // even rows 0, 2, ..., 14
    .rept 4
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    .endr

    // partial SAD goes to s31; v30 is kept so the odd rows can be added to it
    uaddlv    s31, v30.8h
    fmov      w6, s31
    cmp       w6, w4
    b.gt      .Lsad_16x16_ea8_store     // already above the limit

    // odd rows 1, 3, ..., 15
    .rept 4
    ld1       {v0.16b}, [x7], x2
    ld1       {v1.16b}, [x8], x3
    ld1       {v2.16b}, [x7], x2
    ld1       {v3.16b}, [x8], x3
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    .endr

    // full SAD (each lane holds at most 32 * 255, so nothing wraps)
    uaddlv    s31, v30.8h

.Lsad_16x16_ea8_store:
    str       s31, [x5]
    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad2_prog_av8()
////
//// Detail Description : Computes the SADs of one source block against two
////                        reference blocks in a single pass. psad[0] receives
////                        the result for ref1 and psad[1] the result for ref2.
////
//// Platform           : AArch64/NEON
////
////-----------------------------------------------------------------------------
//*/

    .global ime_calculate_sad2_prog_av8
ime_calculate_sad2_prog_av8:

    // x0    = ref1     <UWORD8 *>
    // x1    = ref2     <UWORD8 *>
    // x2    = src     <UWORD8 *>
    // w3    = RefBufferWidth <UWORD32>   (stride used for ref1 and ref2)
    // w4    = CurBufferWidth <UWORD32>   (stride used for src)
    // x5    = psad <UWORD32 *>           (two 32-bit words are written)
    //
    // v30 accumulates |ref1 - src|, v31 accumulates |ref2 - src|.
    //
    // NOTE(review): the loop below covers 8 x 4 = 32 rows and the sums are
    // doubled before the store. Both are kept exactly as found; confirm
    // against the caller whether 16 unscaled rows were intended.
    push_v_regs
    sxtw      x3, w3
    sxtw      x4, w4
    mov       x6, #8
    movi      v30.8h, #0
    movi      v31.8h, #0

core_loop_ime_calculate_sad2_prog_av8:

    // source rows are read through x2; x3 is only ever a stride
    ld1       {v0.16b}, [x0], x3
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x2], x4

    ld1       {v3.16b}, [x0], x3
    ld1       {v4.16b}, [x1], x3
    ld1       {v5.16b}, [x2], x4


    uabal     v30.8h, v0.8b, v2.8b
    uabal2    v30.8h, v0.16b, v2.16b
    uabal     v31.8h, v1.8b, v2.8b
    uabal2    v31.8h, v1.16b, v2.16b

    uabal     v30.8h, v3.8b, v5.8b
    uabal2    v30.8h, v3.16b, v5.16b
    uabal     v31.8h, v4.8b, v5.8b
    uabal2    v31.8h, v4.16b, v5.16b


    ld1       {v6.16b}, [x0], x3
    ld1       {v7.16b}, [x1], x3
    ld1       {v8.16b}, [x2], x4

    ld1       {v9.16b}, [x0], x3
    ld1       {v10.16b}, [x1], x3
    ld1       {v11.16b}, [x2], x4

    uabal     v30.8h, v6.8b, v8.8b
    uabal2    v30.8h, v6.16b, v8.16b
    uabal     v31.8h, v7.8b, v8.8b
    uabal2    v31.8h, v7.16b, v8.16b

    uabal     v30.8h, v9.8b, v11.8b
    uabal2    v30.8h, v9.16b, v11.16b
    uabal     v31.8h, v10.8b, v11.8b
    uabal2    v31.8h, v10.16b, v11.16b  // high half of the same ref2 row (v10)

    subs      x6, x6, #1
    bne       core_loop_ime_calculate_sad2_prog_av8

    // Reduce both accumulators together:
    //   addp   .8h -> [ref1 pair sums x4 | ref2 pair sums x4]
    //   uaddlp .4s -> [ref1 lo, ref1 hi, ref2 lo, ref2 hi]
    //   addp   .4s -> [ref1 total, ref2 total, ...]
    // The pairwise add must span all four lanes; a .2s add would only see
    // the ref1 half and write the same value to both outputs.
    addp      v30.8h, v30.8h, v31.8h
    uaddlp    v30.4s, v30.8h
    addp      v30.4s, v30.4s, v30.4s
    shl       v30.2s, v30.2s, #1

    st1       {v30.2s}, [x5]            // psad[0] = ref1, psad[1] = ref2
    pop_v_regs
    ret

///*
////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad3_prog_av8()
////
//// Detail Description : Computes the SADs of one source block against three
////                        reference blocks in a single pass. psad[0], psad[1]
////                        and psad[2] receive the results for ref1, ref2 and
////                        ref3 respectively.
////
//// Platform           : AArch64/NEON
////
////-----------------------------------------------------------------------------
//*/

    .global ime_calculate_sad3_prog_av8
ime_calculate_sad3_prog_av8:

    // x0    = ref1     <UWORD8 *>
    // x1    = ref2     <UWORD8 *>
    // x2    = ref3     <UWORD8 *>
    // x3    = src     <UWORD8 *>
    // w4    = RefBufferWidth <UWORD32>   (stride used for ref1..ref3)
    // w5    = CurBufferWidth <UWORD32>   (stride used for src)
    // x6    = psad <UWORD32 *>           (three 32-bit words are written)
    //
    // v29 / v30 / v31 accumulate |ref1 - src| / |ref2 - src| / |ref3 - src|.
    //
    // NOTE(review): the loop below covers 16 x 2 = 32 rows and the sums are
    // doubled before the store. Both are kept exactly as found; confirm
    // against the caller whether 16 unscaled rows were intended.
    // NOTE(review): psad is assumed to have room for three words - confirm.


    push_v_regs
    sxtw      x4, w4
    sxtw      x5, w5
    mov       x7, #16
    movi      v29.8h, #0
    movi      v30.8h, #0
    movi      v31.8h, #0

core_loop_ime_calculate_sad3_prog_av8:

    ld1       {v0.16b}, [x0], x4
    ld1       {v1.16b}, [x1], x4
    ld1       {v2.16b}, [x2], x4
    ld1       {v3.16b}, [x3], x5

    uabal     v29.8h, v0.8b, v3.8b
    uabal2    v29.8h, v0.16b, v3.16b
    uabal     v30.8h, v1.8b, v3.8b
    uabal2    v30.8h, v1.16b, v3.16b
    uabal     v31.8h, v2.8b, v3.8b
    uabal2    v31.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x4
    ld1       {v5.16b}, [x1], x4
    ld1       {v6.16b}, [x2], x4
    ld1       {v7.16b}, [x3], x5

    uabal     v29.8h, v4.8b, v7.8b
    uabal2    v29.8h, v4.16b, v7.16b
    uabal     v30.8h, v5.8b, v7.8b
    uabal2    v30.8h, v5.16b, v7.16b
    uabal     v31.8h, v6.8b, v7.8b
    uabal2    v31.8h, v6.16b, v7.16b

    subs      x7, x7, #1
    bne       core_loop_ime_calculate_sad3_prog_av8

    // Reduce all three accumulators (the ref1 sum in v29 must not be dropped):
    //   addp   .8h -> v29 = [ref1 pairs x4 | ref2 pairs x4], v31 = [ref3 pairs x8]
    //   uaddlp .4s -> widen the pair sums to 32 bits
    //   addp   .4s -> v29 = [ref1, ref2, ref3, ref3]
    addp      v29.8h, v29.8h, v30.8h
    addp      v31.8h, v31.8h, v31.8h
    uaddlp    v29.4s, v29.8h
    uaddlp    v31.4s, v31.8h
    addp      v29.4s, v29.4s, v31.4s
    shl       v29.4s, v29.4s, #1

    st1       {v29.2s}, [x6], #8        // psad[0] = ref1, psad[1] = ref2
    st1       {v29.s}[2], [x6]          // psad[2] = ref3
    pop_v_regs
    ret




///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) for sub-pel motion estimation
//*
//* @par   Description
//*   Computes, in one pass over 16 rows of 16 bytes, the SAD of the source
//*   block (x0, stride w4) against eight candidate blocks derived from the
//*   three plane pointers x1 (half x), x2 (half y) and x3 (half xy), all of
//*   which are stepped with stride w5.
//*
//* @param[out] pi4_sad (x6)
//*  integer evaluated sad
//*  pi4_sad[0] - half x
//*  pi4_sad[1] - half x - 1
//*  pi4_sad[2] - half y
//*  pi4_sad[3] - half y - strd
//*  pi4_sad[4] - half xy
//*  pi4_sad[5] - half xy - 1
//*  pi4_sad[6] - half xy - strd
//*  pi4_sad[7] - half xy - 1 - strd
//*
//******************************************************************************
//*/

.text
.p2align 2

    .global ime_sub_pel_compute_sad_16x16_av8
ime_sub_pel_compute_sad_16x16_av8:
    push_v_regs
    sxtw      x4, w4                    // source stride
    sxtw      x5, w5                    // stride of the three candidate planes

    // neighbour pointers derived from the three plane pointers
    sub       x7, x1, #1                // half x, one byte to the left
    sub       x9, x3, #1                // half xy, one byte to the left
    sub       x8, x2, x5                // half y, one row up
    sub       x10, x3, x5               // half xy, one row up
    sub       x11, x10, #1              // half xy, one row up and one byte left

    // one accumulator of eight 16-bit lanes per candidate, in output order
    movi      v24.8h, #0
    movi      v25.8h, #0
    movi      v26.8h, #0
    movi      v27.8h, #0
    movi      v28.8h, #0
    movi      v29.8h, #0
    movi      v30.8h, #0
    movi      v31.8h, #0

    mov       w12, #16                  // rows
.Lsub_pel_sad_16x16_row:
    ld1       {v0.16b}, [x0], x4        // source row, shared by all candidates

    ld1       {v1.16b}, [x1], x5        // half x
    ld1       {v2.16b}, [x7], x5        // half x - 1
    uabal     v24.8h, v0.8b, v1.8b
    uabal2    v24.8h, v0.16b, v1.16b
    uabal     v25.8h, v0.8b, v2.8b
    uabal2    v25.8h, v0.16b, v2.16b

    ld1       {v3.16b}, [x2], x5        // half y
    ld1       {v4.16b}, [x8], x5        // half y - strd
    uabal     v26.8h, v0.8b, v3.8b
    uabal2    v26.8h, v0.16b, v3.16b
    uabal     v27.8h, v0.8b, v4.8b
    uabal2    v27.8h, v0.16b, v4.16b

    ld1       {v5.16b}, [x3], x5        // half xy
    ld1       {v6.16b}, [x9], x5        // half xy - 1
    uabal     v28.8h, v0.8b, v5.8b
    uabal2    v28.8h, v0.16b, v5.16b
    uabal     v29.8h, v0.8b, v6.8b
    uabal2    v29.8h, v0.16b, v6.16b

    ld1       {v7.16b}, [x10], x5       // half xy - strd
    ld1       {v16.16b}, [x11], x5      // half xy - 1 - strd
    uabal     v30.8h, v0.8b, v7.8b
    uabal2    v30.8h, v0.16b, v7.16b
    uabal     v31.8h, v0.8b, v16.8b
    uabal2    v31.8h, v0.16b, v16.16b

    subs      w12, w12, #1
    b.ne      .Lsub_pel_sad_16x16_row

    // Pairwise reduction: fold two accumulators into one register, widen the
    // pair sums to 32 bits, then fold again so v24 = sad[0..3], v25 = sad[4..7].
    addp      v24.8h, v24.8h, v25.8h
    uaddlp    v24.4s, v24.8h
    addp      v26.8h, v26.8h, v27.8h
    uaddlp    v26.4s, v26.8h
    addp      v28.8h, v28.8h, v29.8h
    uaddlp    v28.4s, v28.8h
    addp      v30.8h, v30.8h, v31.8h
    uaddlp    v30.4s, v30.8h

    addp      v24.4s, v24.4s, v26.4s
    addp      v25.4s, v28.4s, v30.4s

    st1       {v24.4s, v25.4s}, [x6]    // eight 32-bit SADs


    pop_v_regs
    ret


///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x16 blocks
//*
//* @par   Description
//*   Accumulates the absolute differences of 16 rows of 16 bytes each and
//*   writes the 32-bit total to pi4_mb_distortion. i4_max_sad is not read by
//*   this routine, so the complete 16x16 SAD is always computed.
//*
//* @param[in] pu1_src (x0)
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst (x1)
//*  UWORD8 pointer to the block the source is compared against
//*
//* @param[in] src_strd (w2)
//*  integer source stride
//*
//* @param[in] dst_strd (w3)
//*  integer destination stride
//*
//* @param[in] i4_max_sad (w4)
//*  integer maximum allowed distortion (unused)
//*
//* @param[out] pi4_mb_distortion (x5)
//*  integer evaluated sad
//*
//******************************************************************************
//*/
    .global ime_compute_sad_16x16_av8
ime_compute_sad_16x16_av8:
    push_v_regs
    sxtw      x2, w2                    // strides arrive as 32-bit ints
    sxtw      x3, w3
    movi      v30.8h, #0                // sums for bytes 0-7 of every row
    movi      v31.8h, #0                // sums for bytes 8-15 of every row
    mov       w6, #4                    // 4 passes x 4 rows = 16 rows

.Lsad_16x16_four_rows:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabal     v30.8h, v0.8b, v1.8b
    uabal     v30.8h, v2.8b, v3.8b
    uabal     v30.8h, v4.8b, v5.8b
    uabal     v30.8h, v6.8b, v7.8b

    uabal2    v31.8h, v0.16b, v1.16b
    uabal2    v31.8h, v2.16b, v3.16b
    uabal2    v31.8h, v4.16b, v5.16b
    uabal2    v31.8h, v6.16b, v7.16b

    subs      w6, w6, #1
    b.ne      .Lsad_16x16_four_rows

    // merge the two halves (at most 32 * 255 per lane) and sum across lanes
    add       v30.8h, v30.8h, v31.8h
    uaddlv    s30, v30.8h

    str       s30, [x5]
    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad4_prog_av8()
////
//// Detail Description : Computes, in one pass over 16 rows of 16 bytes, the
////                        SADs between the block at x1 and the four blocks
////                        one byte left, one byte right, one row above and
////                        one row below the block at x0.
////
//// Platform           : AArch64/NEON
////
////-----------------------------------------------------------------------------
//*/

    .global ime_calculate_sad4_prog_av8
ime_calculate_sad4_prog_av8:
    // x0 = centre of the four neighbouring positions, stepped with stride w2
    // x1 = block compared against every neighbour, stepped with stride w3
    // x4 = output: four 32-bit SADs in the order left, right, top, bottom
    push_v_regs
    sxtw      x2, w2
    sxtw      x3, w3

    // neighbour pointers around x0
    sub       x7, x0, x2                // top
    add       x8, x0, x2                // bottom
    sub       x5, x0, #1                // left
    add       x6, x0, #1                // right

    // v28..v31 accumulate left, right, top, bottom
    movi      v28.8h, #0
    movi      v29.8h, #0
    movi      v30.8h, #0
    movi      v31.8h, #0

    mov       w9, #16                   // rows
.Lsad4_prog_row:

    ld1       {v0.16b}, [x1], x3        // row shared by all four comparisons

    ld1       {v1.16b}, [x5], x2        // left
    uabal     v28.8h, v0.8b, v1.8b
    uabal2    v28.8h, v0.16b, v1.16b

    ld1       {v2.16b}, [x6], x2        // right
    uabal     v29.8h, v0.8b, v2.8b
    uabal2    v29.8h, v0.16b, v2.16b

    ld1       {v3.16b}, [x7], x2        // top
    uabal     v30.8h, v0.8b, v3.8b
    uabal2    v30.8h, v0.16b, v3.16b

    ld1       {v4.16b}, [x8], x2        // bottom
    uabal     v31.8h, v0.8b, v4.8b
    uabal2    v31.8h, v0.16b, v4.16b

    subs      w9, w9, #1
    b.ne      .Lsad4_prog_row

    // fold pairs of accumulators, widen to 32 bits, fold again:
    // v28 = [left, right, top, bottom]
    addp      v28.8h, v28.8h, v29.8h
    uaddlp    v28.4s, v28.8h
    addp      v30.8h, v30.8h, v31.8h
    uaddlp    v30.4s, v30.8h

    addp      v28.4s, v28.4s, v30.4s
    st1       {v28.4s}, [x4]
    pop_v_regs
    ret



//*****************************************************************************
//*
//* Function Name         : ime_compute_satqd_16x16_lumainter_av8
//* Description           : This function computes the SAD of a 16x16 block.
//                        : It also reports whether any 4x4 block may have a
//                        : nonzero coefficient after transform and quant.
//                        : The block is walked in four bands of four rows.
//                        : While every threshold test of a band fails, the
//                        : next band is tested too; as soon as one test
//                        : succeeds the remaining bands only contribute to
//                        : the SAD.
//
//  Arguments             :   x0 :pointer to src buffer
//                            x1 :pointer to est buffer
//                            w2 :source stride
//                            w3 :est stride
//                            x4 :pointer to thresholds (9 halfwords are read)
//                            x5 :pointer to distortion (32-bit SAD is written)
//                            x6 :pointer to is_nonzero (32-bit 0 or 1 is written)
//*
//* Values Returned   : NONE
//*
//* Register Usage    : x0-x7, v0-v31
//* Stack Usage       : 64 bytes (d8-d15 saved by push_v_regs)
//* Cycles            : Around
//* Interruptibility  : Interruptible
//*
//* Known Limitations
//*   \Assumptions    :
//*
//* Revision History  :
//*         DD MM YYYY    Author(s)           Changes
//*         14 04 2014    Harinarayanan K K  First version
//*
//*****************************************************************************
    .global ime_compute_satqd_16x16_lumainter_av8
ime_compute_satqd_16x16_lumainter_av8:
    //x0 :pointer to src buffer
    //x1 :pointer to est buffer
    //w2 :Source stride
    //w3 :Pred stride
    //x4 :Threshold pointer
    //x5 :Distortion,ie SAD
    //x6 :is nonzero
    //x7 :loop counter
    push_v_regs                         // saves d8-d15 once; no second save is needed
    sxtw      x2, w2
    sxtw      x3, w3

    //first eight thresholds, spread over v20-v23 (two per register)
    ld1       {v30.8h}, [x4]

    dup       v20.4h, v30.h[1]          //ls1
    dup       v24.4h, v30.h[0]          //ls2
    dup       v21.4h, v30.h[5]          //ls3
    dup       v25.4h, v30.h[7]          //ls4
    dup       v22.4h, v30.h[3]          //ls5
    dup       v26.4h, v30.h[4]          //ls6
    dup       v23.4h, v30.h[6]          //ls7
    dup       v27.4h, v30.h[2]          //ls8

    mov       v20.d[1], v24.d[0]
    mov       v21.d[1], v25.d[0]
    mov       v22.d[1], v26.d[0]
    mov       v23.d[1], v27.d[0]

    //ninth threshold (halfword at byte offset 16), compared against the 4x4 sads
    add       x4, x4, #16
    ld1       {v29.h}[0], [x4]
    dup       v29.4h, v29.h[0]

    //v31 accumulates the sad
    movi      v31.8h, #0

    mov       x7, #4                    //four bands of four rows
core_loop_satqd_ime_compute_satqd_16x16_lumainter:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    //absolute differences of the four rows, widened to 16 bits
    uabdl     v10.8h, v0.8b, v1.8b
    uabdl2    v15.8h, v0.16b, v1.16b
    uabdl     v11.8h, v2.8b, v3.8b
    uabdl2    v16.8h, v2.16b, v3.16b
    uabdl     v12.8h, v4.8b, v5.8b
    uabdl2    v17.8h, v4.16b, v5.16b
    uabdl     v13.8h, v6.8b, v7.8b
    uabdl2    v18.8h, v6.16b, v7.16b

    add       v0.8h, v10.8h, v13.8h
    add       v1.8h, v11.8h, v12.8h
    add       v2.8h, v15.8h, v18.8h
    add       v3.8h, v16.8h, v17.8h

    //v0 : S1     S4     S4     S1        A1    A4    A4    A1
    //v1 : S2     S3     S3     S2        A2    A3    A3    A2
    //v2 : B1     B4     B4     B1        X1    X4    X4    X1
    //v3 : B3     B2     B2     B3        X3    X2    X2    X3

    trn1      v4.8h, v0.8h, v1.8h
    trn2      v5.8h, v0.8h, v1.8h
    trn1      v6.8h, v2.8h, v3.8h
    trn2      v7.8h, v2.8h, v3.8h

    trn1      v0.4s, v4.4s, v6.4s
    trn2      v2.4s, v4.4s, v6.4s
    trn1      v1.4s, v5.4s, v7.4s
    trn2      v3.4s, v5.4s, v7.4s

    add       v4.8h, v0.8h, v3.8h
    add       v5.8h, v1.8h, v2.8h
    //v4 : S1     S2     B1     B2      A1    A2    X1    X2
    //v5 : S4     S3     B4     B3      A4    A3    X4    X3

    //compute sad for each 4x4 block
    add       v6.8h, v4.8h, v5.8h
    addp      v19.8h, v6.8h, v6.8h
    //duplicate the sad into 128 bit so that we can compare using 128bit
    add       v31.4h, v31.4h, v19.4h

    //sad_2 = sad_1<<1;
    shl       v28.8h, v19.8h, #1

    //sad_2 - pu2_thrsh
    sub       v24.8h, v28.8h, v20.8h
    sub       v25.8h, v28.8h, v21.8h
    sub       v26.8h, v28.8h, v22.8h
    sub       v27.8h, v28.8h, v23.8h

    trn1      v0.4s, v4.4s, v5.4s
    trn2      v1.4s, v4.4s, v5.4s
    //v0 : S1     S2     S4     S3      A1    A2    A4    A3
    //v1 : B1     B2     B4     B3      X1    X2    X4    X3

    trn1      v4.8h, v0.8h, v1.8h
    trn2      v5.8h, v0.8h, v1.8h
    //v4 : S1     B1     S4     B4      A1    X1    A4    X4
    //v5 : S2     B2     S3     B3      A2    X2    A3    X3

    mov       v7.s[0], v4.s[1]
    mov       v7.s[1], v4.s[3]
    mov       v6.s[0], v5.s[1]          // V4 //S1 B1 A1 X1
    mov       v6.s[1], v5.s[3]          // V5 //S2 B2 A2 X2
    mov       v4.s[1], v4.s[2]          // V6 //S3 B3 A3 X3
    mov       v5.s[1], v5.s[2]          // V7 //S4 B4 A4 X4

    shl       v0.4h, v4.4h, #1          //S1<<1
    shl       v1.4h, v5.4h, #1          //S2<<1
    shl       v2.4h, v6.4h, #1          //S3<<1
    shl       v3.4h, v7.4h, #1          //S4<<1

    add       v8.4h, v5.4h, v6.4h       //(s2[j] + s3[j]))
    add       v9.4h, v4.4h, v7.4h       //(s1[j] + s4[j]))
    add       v10.4h, v6.4h, v7.4h      //(s3[j] + s4[j]))
    sub       v11.4h, v6.4h, v0.4h      //(s3[j] - (s1[j]<<1))
    sub       v12.4h, v7.4h, v1.4h      //(s4[j] - (s2[j]<<1))
    add       v13.4h, v4.4h, v5.4h      //(s1[j] + s2[j]))
    sub       v14.4h, v5.4h, v3.4h      //(s2[j] - (s4[j]<<1)))
    sub       v15.4h, v4.4h, v2.4h      //(s1[j] - (s3[j]<<1)))

    mov       v8.d[1], v9.d[0]
    mov       v10.d[1], v11.d[0]
    mov       v12.d[1], v13.d[0]
    mov       v14.d[1], v15.d[0]

    //signed compares; any lane that passes marks the band as "nonzero"
    cmge      v0.8h, v24.8h, v8.8h      //ls1 ls2
    cmge      v1.8h, v25.8h, v10.8h     //ls3 ls4
    cmge      v2.8h, v26.8h, v12.8h     //ls5 ls6
    cmge      v3.8h, v27.8h, v14.8h     //ls7 ls8
    cmge      v4.4h, v19.4h, v29.4h     //sad

    //merge all compare masks into 64 bits
    orr       v0.16b, v0.16b, v1.16b
    orr       v2.16b, v2.16b, v3.16b
    orr       v2.16b, v0.16b, v2.16b
    xtn       v2.8b, v2.8h
    orr       v2.8b, v2.8b, v4.8b

    //if the comparison is non zero, out
    //x4 keeps the merged mask; it is read again at satdq_end_func
    mov       x4, v2.d[0]
    cmp       x4, #0
    bne       core_loop_compute_sad_pre

    subs      x7, x7, #1
    bne       core_loop_satqd_ime_compute_satqd_16x16_lumainter
    b         satdq_end_func


    //plain sad over the bands that were not examined above
core_loop_compute_sad:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3

    uabal     v31.8h, v0.8b, v1.8b
    uabal2    v31.8h, v0.16b, v1.16b

    uabal     v31.8h, v2.8b, v3.8b
    uabal2    v31.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabal     v31.8h, v4.8b, v5.8b
    uabal2    v31.8h, v4.16b, v5.16b

    uabal     v31.8h, v6.8b, v7.8b
    uabal2    v31.8h, v6.16b, v7.16b

    //entry point of the early exit: the current band is already in v31
core_loop_compute_sad_pre:
    subs      x7, x7, #1
    bne       core_loop_compute_sad

satdq_end_func:

    //is_nonzero = (x4 != 0) ? 1 : 0
    mov       x7, #1
    cmp       x4, #0
    csel      x7, x4, x7, eq
    str       w7, [x6]

    //reduce the eight 16-bit partial sums to a single 32-bit sad
    addp      v31.8h, v31.8h, v31.8h
    uaddlp    v31.4s, v31.8h
    addp      v31.2s, v31.2s, v31.2s
    st1       {v31.s}[0], [x5]


    pop_v_regs
    ret