///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevc_deblk_luma_vert.s
//*
//* //brief
//*  contains the function definition for luma deblocking filtering of a
//* vertical edge. the function is coded in armv8 (aarch64) neon assembly
//* using gnu assembler syntax
//*
//* //author
//*  anand s
//*
//* //par list of functions:
//*  - ihevc_deblk_luma_vert_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************/

.text
.align 4



.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table

.globl ihevc_deblk_luma_vert_av8

.type ihevc_deblk_luma_vert_av8, %function

//-----------------------------------------------------------------------------
// ihevc_deblk_luma_vert_av8
//
// Luma deblocking of 4 rows across a vertical edge located at x0.
// Pixels pu1_src[-4] .. pu1_src[3] of rows 0..3 (x1 bytes apart) are read.
//
// Arguments as consumed by the code:
//   x0       : pu1_src, points at the first pixel on the q side (column 0)
//   w1       : src_strd
//   w2       : bs
//   w3, w4   : quant_param_p, quant_param_q
//   w5       : beta_offset_div2
//   w6       : tc_offset_div2
//   w7       : filter flag for the p side
//   [sp, #0] : filter flag for the q side (32-bit load)
//
// This first part saves the callee-saved registers that are used
// (d8-d15, x19-x22) and derives the two table indices:
//   x7 = beta index, clipped to 0..51
//   x3 = tc index,   clipped to 0..53
// x21 / x22 keep the p / q filter flags for later.
//
// NOTE(review): arguments 1-4 and 7 are assumed to be 32-bit ints, like
// arguments 5, 6 and the stack argument already are treated by the code -
// confirm against the C prototype.
//-----------------------------------------------------------------------------
ihevc_deblk_luma_vert_av8:

    // AAPCS64 leaves the upper 32 bits of a register that carries a 32-bit
    // argument unspecified. Every scalar argument is used through its 64-bit
    // x view further down (shifts, adds, address arithmetic, compares), so
    // all of them are widened here, not only the two offsets.
    sxtw        x1,w1                       // src_strd
    sxtw        x2,w2                       // bs
    sxtw        x3,w3                       // quant_param_p
    sxtw        x4,w4                       // quant_param_q
    sxtw        x5,w5                       // beta_offset_div2
    sxtw        x6,w6                       // tc_offset_div2
    stp         d8,d9,[sp,#-16]!
    stp         d10,d11,[sp,#-16]!
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    mov         w21,w7                      // x21 = filter flag p, zero-extended like the w22 load below
    ldr         w22,[sp,#96]                // x22 = filter flag q; 6 pairs pushed above => caller's stack args start at sp+96
    add         x3,x3,x4
    add         x3,x3,#1
    asr         x3,x3,#1                    // x3 = qp_luma = (quant_param_p + quant_param_q + 1) >> 1
    add         x7,x3,x5,lsl #1             // x7 = qp_luma + (beta_offset_div2 << 1)
    add         x3,x3,x6,lsl #1             // x3 = qp_luma + (tc_offset_div2 << 1)
    cmp         x7,#0x33
    mov         x20,#0x33
    csel        x7, x20, x7,gt              // upper clip at 51
    bgt         l1.56                       // already clipped high, the lower clip cannot apply
    cmp         x7,#0x0
    mov         x20,#0x0
    csel        x7, x20, x7,lt              // x7 has the beta_index value
l1.56:

//     bic      x2,x2,#1
    asr         x2,x2,#1                    // x2 = bs >> 1

    add         x3,x3,x2,lsl #1             // x3 += 2 * (bs >> 1)
    cmp         x3,#0x35
    mov         x20,#0x35
    csel        x3, x20, x3,gt              // upper clip at 53
    bgt         l1.88                       // already clipped high, the lower clip cannot apply
    cmp         x3,#0x0
    mov         x20,#0x0
    csel        x3, x20, x3,lt              // x3 has the tc_index value

//    qp_luma = (quant_param_p + quant_param_q + 1) >> 1//
//    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)//
//    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)//

// Table lookup, early exits, pixel load/transpose and the strong/weak decision.
//
// In:  x0 = pu1_src, x1 = src_strd, x7 = beta index, x3 = tc index,
//      x21 / x22 = filter flag p / q
// Notation used below, per row: pN = pu1_src[-1 - N], qN = pu1_src[N].
// Leaves through:
//   l1.964               - nothing to filter (tc == 0 or d >= beta)
//   l1.336               - a strong-filter condition failed (weak path)
//   end_dep_deq_decision - all strong-filter conditions hold, x2 = 2
// Scalar decision code and NEON filter arithmetic are interleaved; the NEON
// results computed here are only consumed on the strong-filter path.
l1.88:
    adrp        x2, :got:gai4_ihevc_beta_table
    ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]   // x2 = address of gai4_ihevc_beta_table (through the GOT)

    movi        v18.8b, #0x2                // v18 = 2 in every byte lane
    adrp        x4, :got:gai4_ihevc_tc_table
    ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]     // x4 = address of gai4_ihevc_tc_table (through the GOT)

    ldr         w5,[x2,x7,lsl #2]           // beta = 32-bit table entry at beta index
    movi        v16.8h, #0x2                // v16 = 2 in every halfword lane
    ldr         w6,[x4,x3,lsl #2]           // tc = 32-bit table entry at tc index
    lsl         x8,x6,#1                    // x8 = 2 * tc
    cmp         x6,#0                       // flags used by the beq below (nothing in between writes flags)
    dup         v19.8b,w8                   // v19 = 2 * tc in every byte lane
    sub         x7,x0,#4                    // x7 = &pu1_src[-4], start of the 8 pixel row
    movi        v23.8b, #0x3                // v23 = 3 in every byte lane
    beq         l1.964                      // tc == 0 : return without touching the pixels


    sub         x19,x0,#3                   // x19 = &pu1_src[-3] (row 0)
    ld1         {v15.8b},[x7],x1            // row 0 : pu1_src[-4..3]
    ldrb        w8,[x19]                    // -3 value
    ld1         {v1.8b},[x7],x1             // row 1
    ldrb        w10,[x19,#1]                //-2 value
    ld1         {v29.8b},[x7],x1            // row 2
    ldrb        w11,[x19,#2]                //-1 value
    ld1         {v0.8b},[x7]                // row 3
    ldrb        w12,[x0,#0]                 // 0 value
    ldrb        w9,[x0,#1]                  // 1 value
    trn1        v24.8b,v15.8b,v1.8b         // first stage of the 8x4 transpose: rows 0/1, even columns
    trn2        v1.8b,v15.8b,v1.8b          // rows 0/1, odd columns
    ldrb        w2,[x0,#2]                  // 2 value
    trn1        v2.8b,v29.8b,v0.8b          // rows 2/3, even columns
    trn2        v0.8b,v29.8b,v0.8b          // rows 2/3, odd columns
    add         x12,x12,x2                  // pu1_src[0] + pu1_src[2]
    subs        x9,x12,x9,lsl #1            // dq0 value is stored in x9
    csneg       x9,x9,x9,pl                 // x9 = abs(x9)
//dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )//
    mov         v29.8b,v24.8b
    trn1        v24.4h,v29.4h,v2.4h         // v24 = { p3 rows 0..3 | q0 rows 0..3 }
    trn2        v2.4h,v29.4h,v2.4h          // v2  = { p1 rows 0..3 | q2 rows 0..3 }
    add         x8,x8,x11                   // pu1_src[-3] + pu1_src[-1]
    mov         v15.8b,v1.8b
    trn1        v1.4h,v15.4h,v0.4h          // v1  = { p2 rows 0..3 | q1 rows 0..3 }
    trn2        v0.4h,v15.4h,v0.4h          // v0  = { p0 rows 0..3 | q3 rows 0..3 }
    subs        x8,x8,x10,lsl #1
    csneg       x8,x8,x8,pl                 // x8 = abs(x8) = dp0
//  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )//



    add         x14,x1,x1,lsl #1            // 3 * src_strd
    add         x14,x0,x14                  // x14 = &pu1_src[3 * src_strd] (row 3)

    sub         x19,x14,#3                  // x19 = row 3, column -3
    dup         v4.2s, v24.s[1]             // v4 = q0 (4 rows, replicated in both halves)
    ldrb        w2,[x19]                    // -3 value
    dup         v7.2s, v2.s[1]              // v7 = q2
    ldrb        w10,[x19,#1]                // -2 value
    dup         v3.2s, v2.s[0]              // v3 = p1
    ldrb        w11,[x19,#2]                // -1 value
    dup         v5.2s, v1.s[1]              // v5 = q1
    ldrb        w12,[x14,#0]                // 0 value
    dup         v6.2s, v1.s[0]              // v6 = p2
    ldrb        w3,[x14,#1]                 // 1 value
    dup         v2.2s, v0.s[0]              // v2 = p0 (old v2 contents were consumed above)
    ldrb        w4,[x14,#2]                 // 2 value


    add         x12,x12,x4
    subs        x12,x12,x3,lsl #1           // dq3value is stored in x12
    csneg       x12,x12,x12,pl              // x12 = abs(x12)
//    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )//


    add         x2,x2,x11
    subs        x11,x2,x10,lsl #1
    csneg       x11,x11,x11,pl              // dp3 value is stored in x11
//    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )//



    add         x3,x8,x9                    // x3 has the d0 value
    add         x4,x11,x12                  // x4 has the d3 value


//    d0 = dp0 + dq0//
//    d3 = dp3 + dq3//

    add         x14,x8,x11                  // x14 has the value dp
    add         x12,x12,x9                  // x12 has the value  dq
//    dp = dp0 + dp3//
//   dq = dq0 + dq3//

    add         x11, x3, x4                 // x11 has the value d

//   d = d0 + d3//


    cmp         x11,x5
    dup         v22.2s, v0.s[1]             // v22 = q3
    bge         l1.964                      // d >= beta : return without filtering

//    if(d < beta)


    // live scalar values at this point: x0 (src), x1 (stride), x3 (d0), x4 (d3),
    // x5 (beta), x6 (tc), x12 (dq), x14 (dp)

    // registers for use: x2,x7,x8,x9,x10,
    uqsub       v30.8b,v7.8b,v19.8b         // v30 = q2 - 2*tc (saturating) : lower bound for q2'
    asr         x10,x5,#2                   // x10 = beta >> 2
    uqadd       v31.8b,v7.8b,v19.8b         // v31 = q2 + 2*tc (saturating) : upper bound for q2'
    cmp         x10,x3,lsl #1
    uaddl       v0.8h,v5.8b,v4.8b           // v0 = q1 + q0 (16-bit lanes)
    ble         l1.336                      // (beta >> 2) <= 2 * d0 : not strong

    sub         x19,x0,4                    // x19 = &pu1_src[-4] (row 0)
    ldrb        w2,[x19]                    // row 0, pu1_src[-4]
    uaddw       v0.8h,  v0.8h ,  v2.8b      // v0 = p0 + q0 + q1
    ldrb        w7,[x19,#3]                 // row 0, pu1_src[-1]
    umull       v20.8h, v7.8b, v23.8b       // v20 = 3 * q2
    ldrb        w3,[x0,#0]                  // row 0, pu1_src[0] (d0 in x3 is no longer needed)
    umlal       v20.8h, v22.8b, v18.8b      // v20 += 2 * q3
    ldrb        w8,[x0,#3]                  // row 0, pu1_src[3]
//   ubfx   x7,x2,#24,#8           // has the -1 value
//  and    x2,#0xff               // has the -4 value
//  ubfx   x8,x3,#24,#8           // has the 3 value
//  and    x3,#0xff               // x4 has the 0 value

    add         v20.8h,  v20.8h ,  v0.8h    // v20 = 2*q3 + 3*q2 + q1 + q0 + p0
    subs        x8,x8,x3
    rshrn       v22.8b,v20.8h,#3            // rounded >> 3 : unclipped q2'
    csneg       x8,x8,x8,pl                 // x8 = abs(pu1_src[3] - pu1_src[0])
    subs        x2,x2,x7
    umin        v21.8b,  v22.8b ,  v31.8b
    csneg       x2,x2,x2,pl                 // x2 = abs(pu1_src[-4] - pu1_src[-1])
    umax        v22.8b,  v21.8b ,  v30.8b   // v22 = q2' clipped to [q2 - 2*tc, q2 + 2*tc]
    add         x8,x8,x2
    uaddl       v20.8h,v7.8b,v3.8b          // v20 = q2 + p1
    cmp         x8,x5,asr #3
    mla         v20.8h, v0.8h, v16.8h       // v20 = p1 + 2*p0 + 2*q0 + 2*q1 + q2
    bge         l1.336                      // sum of the two abs differences >= (beta >> 3) : not strong
    uaddw       v0.8h,  v0.8h ,  v7.8b      // v0 = p0 + q0 + q1 + q2
    subs        x7,x3,x7
    rshrn       v20.8b,v20.8h,#3            // rounded >> 3 : unclipped q0'
    csneg       x7,x7,x7,pl                 // x7 = abs(pu1_src[0] - pu1_src[-1])
    rshrn       v0.8b,v0.8h,#2              // rounded >> 2 : unclipped q1'
    mov         x10,#5
    uqadd       v30.8b,v5.8b,v19.8b         // v30 = q1 + 2*tc : upper bound for q1'
    mul         x10, x10, x6
    uqsub       v31.8b,v5.8b,v19.8b         // v31 = q1 - 2*tc : lower bound for q1'
    add         x10, x10,#1                 // x10 = 5 * tc + 1
    cmp         x7,x10,asr #1
    bge         l1.336                      // abs(q0 - p0) >= ((5 * tc + 1) >> 1) : not strong


//        if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4])  < (beta >> 3) )
//            && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
// the same three tests are repeated below for row 3, using d3 (x4)


    asr         x10,x5,#2                   // x10 = beta >> 2
    uqsub       v25.8b,v4.8b,v19.8b         // v25 = q0 - 2*tc : lower bound for q0'
    cmp         x10,x4,lsl #1
    uqadd       v21.8b,v4.8b,v19.8b         // v21 = q0 + 2*tc : upper bound for q0'
    ble         l1.336                      // (beta >> 2) <= 2 * d3 : not strong
    umin        v26.8b,  v20.8b ,  v21.8b
    add         x4,x1,x1,lsl #1
    add         x4,x4,x0                    // x4 = &pu1_src[3 * src_strd] (row 3)
    umax        v20.8b,  v26.8b ,  v25.8b   // v20 = clipped q0'
    sub         x19,x4,#4
    ldrb        w2,[x19]                    // row 3, pu1_src[-4]
    umin        v19.8b,  v0.8b ,  v30.8b
    ldrb        w7,[x19,#3]                 // row 3, pu1_src[-1]
    umax        v21.8b,  v19.8b ,  v31.8b   // v21 = clipped q1'
    ldrb        w3,[x4,#0]                  // row 3, pu1_src[0]
    lsl         x10,x6,#1                   // x10 = 2 * tc
    ldrb        w8,[x4,#3]                  // row 3, pu1_src[3]
//   ubfx   x7,x2,#24,#8           // has the -1 value
//  and    x2,#0xff               // has the -4 value
//  ubfx   x8,x3,#24,#8           // has the 3 value
//  and    x3,#0xff               // x4 has the 0 value
    uaddl       v0.8h,v2.8b,v3.8b           // v0 = p0 + p1
    dup         v19.8b,w10                  // v19 = 2 * tc again (it was used as a temporary above)
    subs        x8,x8,x3
    uaddw       v0.8h,  v0.8h ,  v4.8b      // v0 = p0 + p1 + q0
    csneg       x8,x8,x8,pl                 // x8 = abs(pu1_src[3] - pu1_src[0]) of row 3
    uqadd       v30.8b,v2.8b,v19.8b         // v30 = p0 + 2*tc : upper bound for p0'
    subs        x2,x2,x7
    uqsub       v31.8b,v2.8b,v19.8b         // v31 = p0 - 2*tc : lower bound for p0'
    csneg       x2,x2,x2,pl                 // x2 = abs(pu1_src[-4] - pu1_src[-1]) of row 3
    uaddl       v26.8h,v5.8b,v6.8b          // v26 = q1 + p2
    add         x8,x8,x2
    mla         v26.8h, v0.8h, v16.8h       // v26 = p2 + 2*p1 + 2*p0 + 2*q0 + q1
    cmp         x8,x5,asr #3
    bge         l1.336                      // row 3 flatness test failed : not strong
    rshrn       v26.8b,v26.8h,#3            // rounded >> 3 : unclipped p0'
    subs        x7,x3,x7
    uqadd       v27.8b,v3.8b,v19.8b         // v27 = p1 + 2*tc : upper bound for p1'
    csneg       x7,x7,x7,pl                 // x7 = abs(pu1_src[0] - pu1_src[-1]) of row 3
    uqsub       v28.8b,v3.8b,v19.8b         // v28 = p1 - 2*tc : lower bound for p1'
    mov         x10,#5
    umin        v16.8b,  v26.8b ,  v30.8b   // v16 (the constant 2) is no longer needed from here on
    mul         x10, x10, x6
    add         x10, x10,#1                 // x10 = 5 * tc + 1
    cmp         x7,x10,asr #1
    umax        v26.8b,  v16.8b ,  v31.8b   // v26 = clipped p0'
    bge         l1.336                      // abs(q0 - p0) >= ((5 * tc + 1) >> 1) in row 3 : not strong
    uqadd       v30.8b,v6.8b,v19.8b         // v30 = p2 + 2*tc : upper bound for p2'

    mov         x2,#2                       // de = 2 : strong filtering
    mov         x4,x21                      // x4 = filter flag p
    uqsub       v31.8b,v6.8b,v19.8b         // v31 = p2 - 2*tc : lower bound for p2'
    mov         x5,x22                      // x5 = filter flag q (beta is not needed any more on this path)
    b           end_dep_deq_decision
// register contents when execution reaches l1.336 below:
// x6 has the value of tc
// x5 has the value of beta
// x14 has the value of dp
// x12 has the value of dq
// x0 has the value of source address
// x1 has the src stride
// (x2, the value of de, is set at l1.336 itself)

// Weak (normal) filter set-up: decide whether p1 / q1 are modified as well.
//
// In:  x5 = beta, x6 = tc, x12 = dq, x14 = dp, x21 / x22 = filter flag p / q
// Out: x2 = 1 (de), x4 = filter flag p, x5 = filter flag q, x11 = beta,
//      x9 = dep, x10 = deq
//      x7 and x8 are used as scratch
// Falls through into end_dep_deq_decision.
l1.336:
    mov         x2,#1                       // de = 1
l1.424:
    mov         x11,x5                      // beta moves to x11, x5 is reused for the q flag
    mov         x4,x21
    mov         x5,x22

    // tc == 1 : dep = deq = 0, no further checks
    cmp         x6,#1
    csel        x9,xzr,x9,eq
    csel        x10,xzr,x10,eq
    beq         end_dep_deq_decision

    // common threshold for both sides
    add         x8,x11,x11,asr #1
    asr         x8,x8,#3                    // x8 = (beta + (beta >> 1)) >> 3

    and         x7,x4,x5
    cmp         x7,#1
    beq         both_flags_set              // (flag p & flag q) == 1
    cbz         x4,set_flag_dep_zero        // flag p == 0

    // flag p is non-zero but the combined test above failed: p side only
    cmp         x8,x14
    cset        x9,gt                       // dep = (threshold > dp)
    mov         x10,xzr                     // deq = 0
    b           end_dep_deq_decision

set_flag_dep_zero:
    // flag p is zero: q side only
    cmp         x8,x12
    cset        x10,gt                      // deq = (threshold > dq)
    mov         x9,xzr                      // dep = 0
    b           end_dep_deq_decision

both_flags_set:
    cmp         x8,x14
    cset        x9,gt                       // dep = (threshold > dp)
    cmp         x8,x12
    cset        x10,gt                      // deq = (threshold > dq)
// Strong filter write-back (de == 2) followed by the common function exit.
//
// Notation, per row: pN = pu1_src[-1 - N], qN = pu1_src[N].
// When de == 2 the strong-filter decision code has already left these values
// in the NEON registers (lanes 0..3 = rows 0..3):
//   v22 = q2', v20 = q0', v21 = q1'  (each already clipped to +/- 2*tc)
//   v26 = p0' (clipped), v0.8h = p0 + p1 + q0
//   v27 / v28 = p1 + 2*tc / p1 - 2*tc, v30 / v31 = p2 + 2*tc / p2 - 2*tc
//   v6 = p2, v24 low half = p3, v18 = 2, v23 = 3
end_dep_deq_decision:

//x0=source address
//x1=stride
// x2 =de
// x4=flag p
//x5= flag q
//x6 =tc
// x9 =dep
// x10=deq
//    b    l1.964


    cmp         x2,#2
// x2 has the value of de
    bne         l1.968                      // de != 2 : take the weak filter path

    cmp         x5,#0
    beq         l1.780                      // flag q == 0 : leave the q side untouched
// x5 has the flag of q

    add         x3,x0,#2                    // x3 = &pu1_src[2] (row 0)
    st1         {v22.b}[0],[x3],x1          // q2' of row 0

    st1         {v22.b}[1],[x3],x1          // row 1

    st1         {v22.b}[2],[x3],x1          // row 2

    st1         {v22.b}[3],[x3]             // row 3
    add         x3,x0,x1                    // x3 = &pu1_src[0] of row 1
    mov         v29.8b,v20.8b
    trn1        v20.8b,v29.8b,v21.8b        // interleave q0' / q1' : v20.h[0] = row 0 pair, v20.h[1] = row 2 pair
    trn2        v21.8b,v29.8b,v21.8b        // v21.h[0] = row 1 pair, v21.h[1] = row 3 pair

    st1         {v20.h}[0],[x0]             // row 0 : q0', q1'
    st1         {v21.h}[0],[x3],x1          // row 1
    st1         {v20.h}[1],[x3],x1          // row 2
    st1         {v21.h}[1],[x3]             // row 3


l1.780:
    cmp         x4,#0
    beq         l1.964                      // flag p == 0 : leave the p side untouched, return
    // x4 has the flag p


    dup         v7.2s, v24.s[0]             // v7 = p3
    sub         x3,x0,#1                    // x3 = &pu1_src[-1] (row 0)
    uaddw       v16.8h,  v0.8h ,  v6.8b     // v16 = p2 + p1 + p0 + q0
    add         x7,x3,x1                    // x7 = &pu1_src[-1] of row 1
    rshrn       v2.8b,v16.8h,#2             // rounded >> 2 : unclipped p1'
    st1         {v26.b}[0],[x3]             // p0' of row 0
    sub         x0,x0,#3                    // x0 = &pu1_src[-3] (row 0)
    umin        v16.8b,  v2.8b ,  v27.8b
    st1         {v26.b}[1],[x7],x1          // p0' of row 1
    umull       v2.8h, v6.8b, v23.8b        // v2 = 3 * p2
    umlal       v2.8h, v7.8b, v18.8b        // v2 += 2 * p3
    st1         {v26.b}[2],[x7],x1          // p0' of row 2
    umax        v5.8b,  v16.8b ,  v28.8b    // v5 = p1' clipped to [p1 - 2*tc, p1 + 2*tc]
    st1         {v26.b}[3],[x7]             // p0' of row 3
    add         v0.8h,  v2.8h ,  v0.8h      // v0 = 2*p3 + 3*p2 + p1 + p0 + q0
    rshrn       v0.8b,v0.8h,#3              // rounded >> 3 : unclipped p2'


    umin        v1.8b,  v0.8b ,  v30.8b
    umax        v0.8b,  v1.8b ,  v31.8b     // v0 = p2' clipped to [p2 - 2*tc, p2 + 2*tc]

    mov         v29.8b,v0.8b
    trn1        v0.8b,v29.8b,v5.8b          // interleave p2' / p1' : v0.h[0] = row 0 pair, v0.h[1] = row 2 pair
    trn2        v5.8b,v29.8b,v5.8b          // v5.h[0] = row 1 pair, v5.h[1] = row 3 pair
    st1         {v0.h}[0],[x0],x1           // row 0 : p2', p1' at pu1_src[-3], pu1_src[-2]
    st1         {v5.h}[0],[x0],x1           // row 1
    st1         {v0.h}[1],[x0],x1           // row 2
    st1         {v5.h}[1],[x0]              // row 3
l1.964:
    // common exit: restore the callee-saved registers in reverse push order
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
    ret

// Weak (normal) filter: delta computation and p side write-back.
//
// Notation, per row: pN = pu1_src[-1 - N], qN = pu1_src[N].
// In:  x0 = pu1_src, x1 = src_strd, x4 / x5 = filter flag p / q, x6 = tc,
//      x9 = dep, x10 = deq
//      v2 = p0, v3 = p1, v6 = p2, v4 = q0, v5 = q1, v7 = q2
//      (lanes 0..3 = rows 0..3)
// Produces for the q side code at l1.1272:
//      v0 = abs(delta), v1 = 10 * tc, v20 = clipped delta,
//      v23 = clip_u8(q0 - delta), v24 = 1 in every byte lane
l1.968:


    movi        v0.8h, #0x9                 // v0 = 9 in every halfword lane
    neg         x11, x6                     // x11 = -tc
    cmp         x4,#0                       // flags are consumed by "beq l1.1272" below; nothing in between writes flags
    // checks for the flag p
    movi        v16.8h, #0x3                // v16 = 3 in every halfword lane
    movi        v24.8b, #0x1                // v24 = 1 in every byte lane (rounding term for dep/deq)


    dup         v30.8b,w11                  // v30 = -tc in every byte lane
    and         x11,x6,#0xff
    dup         v31.8b,w11                  // v31 = tc in every byte lane

    usubl       v18.8h,v4.8b,v2.8b          // q0 - p0
    mul         v18.8h, v18.8h, v0.8h       // 9 * (q0 - p0)
    usubl       v0.8h,v5.8b,v3.8b           // q1 - p1



    mul         v16.8h, v0.8h, v16.8h       // 3 * (q1 - p1)
    sub         v16.8h,  v18.8h ,  v16.8h
    srshr       v16.8h,v16.8h,#4            // rounding shift supplies the "+ 8"
//   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4//

    abs         v0.8h, v16.8h
    xtn         v0.8b,  v0.8h
    // storing the absolute values of delta in v0 (narrowed to bytes without saturation)

    sqxtn       v16.8b,v16.8h
    // storing delta, saturated to signed 8 bit, in v16

    movi        v1.8b, #0xa
    dup         v21.8b,w11
    mul         v1.8b, v1.8b, v21.8b
    // v1 stores the value (10 * tc), computed in 8-bit lanes

//if(abs(delta) < 10 * tc)

    smin        v18.8b,  v16.8b ,  v31.8b
    smax        v20.8b,  v18.8b ,  v30.8b

// delta = clip3(delta, -tc, tc)//
    sxtl        v16.8h, v20.8b              // clipped delta widened to 16 bit
    uxtl        v18.8h, v2.8b               // p0 widened
    add         v18.8h,  v18.8h ,  v16.8h

    sqxtun      v22.8b, v18.8h              // v22 = tmp_p0
    uxtl        v18.8h, v4.8b               // q0 widened
    sub         v16.8h,  v18.8h ,  v16.8h
    sqxtun      v23.8b, v16.8h              // v23 = tmp_q0
// tmp_p0 = clip_u8(pu1_src[-1] + delta)//
//  tmp_q0 = clip_u8(pu1_src[0] - delta)//
    beq         l1.1272                     // flag p == 0 (from the cmp at the top) : skip the p side



    cmp         x9,#1
    bne         l1.1212                     // dep != 1 : p1 stays as it is
// checks for the flag dep

    asr         x3,x6,#1                    // x3 = tc >> 1


    uaddl       v16.8h,v6.8b,v2.8b          // p2 + p0
    uaddw       v16.8h,  v16.8h ,  v24.8b   // + 1
    dup         v18.8b,w3                   // v18 = tc >> 1
    sub         x20,x3,#0
    neg         x3, x20                     // x3 = -(tc >> 1)
    dup         v19.8b,w3                   // v19 = -(tc >> 1)
    ushr        v16.8h,v16.8h,#1            // (p2 + p0 + 1) >> 1
    xtn         v16.8b,  v16.8h

    usubl       v16.8h,v16.8b,v3.8b         // - p1
    saddw       v16.8h,  v16.8h ,  v20.8b   // + clipped delta
    sshr        v16.8h,v16.8h,#1            // >> 1 (arithmetic)
    sqxtn       v16.8b,v16.8h

    smin        v17.8b,  v16.8b ,  v18.8b
    smax        v16.8b,  v19.8b ,  v17.8b   // v16 = value clipped to [-(tc >> 1), tc >> 1]




    uxtl        v18.8h, v3.8b               // p1 widened
    sxtl        v16.8h, v16.8b
    add         v16.8h,  v18.8h ,  v16.8h

    sqxtun      v16.8b, v16.8h              // v16 = new p1, saturated to unsigned 8 bit
    mov         v30.8b,v3.8b                // v30 = original p1 (the -tc constant is not needed any more)
    cmhs        v3.8b,v0.8b,v1.8b           // mask = abs(delta) >= 10 * tc (unsigned compare)


    bsl         v3.8b,v30.8b,v16.8b         // v3 = mask ? original p1 : new p1
l1.1212:
    dup         v16.8b,w11                  // this value is overwritten by the cmhs below before it is read
    sub         x12,x0,#3                   // x12 = &pu1_src[-3] (row 0)
    sub         x3,x0,#1                    // x3 = &pu1_src[-1] (row 0)
//     smul v16.8b, v16.8b, v1.8b
    mov         v29.8b,v6.8b
    trn1        v6.8b,v29.8b,v3.8b          // interleave p2 / p1 : v6.h[0] = row 0 pair, v6.h[1] = row 2 pair
    trn2        v3.8b,v29.8b,v3.8b          // v3.h[0] = row 1 pair, v3.h[1] = row 3 pair
    st1         {v6.h}[0],[x12],x1          // row 0 : p2 (unchanged), p1
    cmhs        v16.8b,v0.8b,v1.8b          // mask = abs(delta) >= 10 * tc
    st1         {v3.h}[0],[x12],x1          // row 1
    bsl         v16.8b,v2.8b,v22.8b         // v16 = mask ? original p0 : tmp_p0
    st1         {v16.b}[0],[x3],x1          // p0 of row 0
    st1         {v16.b}[1],[x3],x1          // p0 of row 1
    st1         {v6.h}[1],[x12],x1          // row 2 : p2, p1
    st1         {v16.b}[2],[x3],x1          // p0 of row 2
    st1         {v3.h}[1],[x12]             // row 3 : p2, p1
    st1         {v16.b}[3],[x3]             // p0 of row 3
// Weak (normal) filter: q side write-back and function exit.
//
// Notation, per row: qN = pu1_src[N].
// In:  x0 = pu1_src, x1 = src_strd, x5 = filter flag q, x6 = tc, x10 = deq
//      v4 = q0, v5 = q1, v7 = q2, v0 = abs(delta), v1 = 10 * tc,
//      v20 = clipped delta, v23 = clip_u8(q0 - delta), v24 = 1 per byte lane
//      (lanes 0..3 = rows 0..3)
l1.1272:
    cmp         x5,#0
    beq         l1.964                      // flag q == 0 : nothing more to do, return
    // checks for the flag q
    cmp         x10,#1
    bne         l1.1412                     // deq != 1 : q1 stays as it is
    // checks for the flag deq
    mov         v2.8b,v7.8b                 // v2 = q2
    asr         x3,x6,#1                    // x3 = tc >> 1

    dup         v6.8b,w3                    // v6 = tc >> 1
    sub         x20,x3,#0
    neg         x3, x20                     // x3 = -(tc >> 1)
    dup         v16.8b,w3                   // v16 = -(tc >> 1)
    uaddl       v2.8h,v2.8b,v4.8b           // q2 + q0
    uaddw       v2.8h,  v2.8h ,  v24.8b     // + 1
    ushr        v2.8h,v2.8h,#1              // (q2 + q0 + 1) >> 1
    xtn         v2.8b,  v2.8h

    usubl       v2.8h,v2.8b,v5.8b           // - q1
    ssubw       v2.8h,  v2.8h ,  v20.8b     // - clipped delta
    sshr        v2.8h,v2.8h,#1              // >> 1 (arithmetic)
    sqxtn       v3.8b,v2.8h

    smin        v2.8b,  v3.8b ,  v6.8b
    smax        v3.8b,  v16.8b ,  v2.8b     // v3 = value clipped to [-(tc >> 1), tc >> 1]
    //  dup  v6.8b,w2
    //   smul v6.8b, v6.8b, v1.8b



    uxtl        v16.8h, v5.8b               // q1 widened
    sxtl        v2.8h, v3.8b
    add         v2.8h,  v16.8h ,  v2.8h
    sqxtun      v3.8b, v2.8h                // v3 = new q1, saturated to unsigned 8 bit
    mov         v30.8b,v5.8b                // v30 = original q1
    cmhs        v5.8b,v0.8b,v1.8b           // mask = abs(delta) >= 10 * tc (unsigned compare)


    bsl         v5.8b,v30.8b,v3.8b          // v5 = mask ? original q1 : new q1
l1.1412:
    //  dup  v2.8b,w2
    add         x3,x0,#2                    // x3 = &pu1_src[2] (row 0)
    add         x11,x3,x1                   // x11 = &pu1_src[2] of row 1
    //   smul v1.8b, v2.8b, v1.8b
    st1         {v7.b}[0],[x3]              // q2 is written back unchanged, row 0
    st1         {v7.b}[1],[x11],x1          // row 1
    st1         {v7.b}[2],[x11],x1          // row 2
    cmhs        v0.8b,v0.8b,v1.8b           // mask = abs(delta) >= 10 * tc
    st1         {v7.b}[3],[x11]             // row 3
    bsl         v0.8b,v4.8b,v23.8b          // v0 = mask ? original q0 : tmp_q0
    mov         v29.8b,v0.8b
    trn1        v0.8b,v29.8b,v5.8b          // interleave q0 / q1 : v0.h[0] = row 0 pair, v0.h[1] = row 2 pair
    trn2        v5.8b,v29.8b,v5.8b          // v5.h[0] = row 1 pair, v5.h[1] = row 3 pair
    st1         {v0.h}[0],[x0],x1           // row 0 : q0, q1
    st1         {v5.h}[0],[x0],x1           // row 1
    st1         {v0.h}[1],[x0],x1           // row 2
    st1         {v5.h}[1],[x0]              // row 3

    // same register restore sequence as at l1.964
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
    ret