///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_filters_planar.s
//*
//* @brief
//*  contains function definitions for planar intra prediction.
//* functions are coded in aarch64 neon assembly (gnu assembler syntax)
//*
//*
//* @author
//*  akshaya mukund
//*
//* @par list of functions:
//*
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*    luma intraprediction filter for planar input
//*
//* @par description:
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] pi1_coeff
//*  word8 pointer to the planar coefficients
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  type of filtering
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
//                                  word32 src_strd,
//                                  uword8* pu1_dst,
//                                  word32 dst_strd,
//                                  word32 nt,
//                                  word32 mode,
//                   word32 pi1_coeff)
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd

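//x4 => nt
//x5 => mode (overwritten without being read)
//pi1_coeff is not read; the coefficients come from the tables
//gau1_ihevc_planar_factor and gau1_ihevc_planar_factor_1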

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_planar_av8
.extern gau1_ihevc_planar_factor
.extern gau1_ihevc_planar_factor_1

.type ihevc_intra_pred_luma_planar_av8, %function

ihevc_intra_pred_luma_planar_av8:

    // Entry and common setup for the planar luma intra predictor.
    //
    // Registers read on entry:
    //   x0 = pu1_ref, x2 = pu1_dst, w3 = dst_strd, w4 = nt
    // x1 (src_strd) and x5 (mode) are never read; both are reused as scratch.
    //
    // Values left for the code that follows:
    //   v0  = src[nt-1]            v1  = src[3nt+1]
    //   v2  = nt (bytes)           v29 = -(32 - clz(nt)) (shift for sshl)
    //   v5  = row+1 (starts at 1)  v6  = nt-1-row       v7 = 1
    //   x6  = &src[2nt-1]          x14 = x0 = &src[2nt+1]
    //   x11 = gau1_ihevc_planar_factor, x12 = x11 + 1
    //   x1  = nt                   x10 = 8

    stp         x19, x20,[sp,#-16]!         //x20 is used as scratch; x19 is not otherwise touched here

    // nt and dst_strd are 32-bit arguments. The upper halves of their
    // registers are unspecified on entry, but x3/x4 are used below as 64-bit
    // offsets, counters and post-index strides, so widen them first.
    sxtw        x3, w3                      //dst_strd
    sxtw        x4, w4                      //nt

    adrp        x11, :got:gau1_ihevc_planar_factor //loads table of coeffs
    ldr         x11, [x11, #:got_lo12:gau1_ihevc_planar_factor]

    clz         w5,w4
    sub         x20, x5, #32
    neg         x5, x20                     //32 - clz(nt)
    dup         v29.8h,w5
    neg         v29.8h, v29.8h              //shr value (negative count makes sshl shift right)
    dup         v2.8b,w4                    //nt

    sub         x6, x4, #1                  //nt-1
    add         x6, x6, x0
    ldrb        w7,  [x6]                   //only one sample is needed
    dup         v0.8b,w7                    //src[nt-1]

    add         x6, x4, x4,lsl #1           //3nt
    add         x6, x6, #1                  //3nt + 1
    add         x6, x6, x0
    ldrb        w7,  [x6]                   //only one sample is needed
    dup         v1.8b,w7                    //src[3nt+1]

    add         x6, x4, x4                  //2nt
    add         x14, x6, #1                 //2nt+1
    sub         x6, x6, #1                  //2nt-1
    add         x6, x6, x0                  //&src[2nt-1]
    add         x14, x14, x0                //&src[2nt+1]

    mov         x8, #1                      //row+1 (row is first 0)
    sub         x9, x4, x8                  //nt-1-row (row is first 0)

    dup         v5.8b,w8                    //row + 1
    dup         v6.8b,w9                    //nt - 1 - row
    mov         v7.8b, v5.8b                //mov #1 to d7 to used for inc for row+1 and dec for nt-1-row

    add         x12, x11, #1                //coeffs (to be reloaded after every row)
    mov         x1, x4                      //nt (row counter) (dec after every row)
    mov         x10, #8                     //increment for the coeffs
    mov         x0, x14                     //&src[2nt+1] (to be reloaded after every row)

    cmp         x4, #4
    beq         tf_sz_4

//@ ========== ***************** =====================
prolog:
tf_sz_8_16_32:
    // Setup for block sizes other than 4 (reached by falling through the
    // nt == 4 check). Output is produced in 8x8 tiles; every tile subtracts
    // 8 from x7, so x7 = nt * (nt / 8) covers the whole nt x nt block.

    mov         x7, x4                      //column counter (set to no of cols)
    lsr         x9, x4, #3                  //divide nt by 8
    mul         x7, x7, x9                  //multiply width * height
    adrp        x5, :got:gau1_ihevc_planar_factor_1 //loads table of coeffs
    ldr         x5, [x5, #:got_lo12:gau1_ihevc_planar_factor_1]
    sub         x6, x6, #7                  //&src[2nt-8]: an 8-byte load now ends at src[2nt-1]
    mov         x8, x2                      //copy of dst (x8 is not read again in this routine)
    lsl         x9, x3, #3                  //8*stride
    sub         x20, x9, #8                 //8*stride - 8
    neg         x9, x20                     //8 - 8*stride (step from the end of one tile to the next tile on its right)
    mov         x10, x4                     //nt
    sub         x10, x10, #8                //nt - 8

col_loop_8_16_32:
    // First 8x8 tile, software pipelined. The (k) tag in each comment names
    // the output row (1-8) of the tile that the instruction works on.
    // Each row accumulator starts as nt (dup ..., w4) and then receives the
    // four products listed on row (1). Rows (1)-(6) are narrowed and stored
    // here; rows (7) and (8) are completed by kernel_plnr or by epilog.
    // v4 holds 8 left-column samples; lane 7 is used for row (1), lane 0
    // for row (8).

    ld1         {v17.8b},[x12]              //(1-8)load 8 coeffs [col+1]
    dup         v27.8h,w4                   //(1)
    ld1         {v4.8b},[x6]                //(1-8)src[2nt-1-row]
    sub         v19.8b,  v2.8b ,  v17.8b    //(1-8)[nt-1-col]


    umlal       v27.8h, v5.8b, v0.8b        //(1)(row+1)    *    src[nt-1]

    ld1         {v3.8b},[x14]               //(1-8)load 8 src[2nt+1+col]
    umlal       v27.8h, v17.8b, v1.8b       //(1)(col+1)    *    src[3nt+1]

    dup         v20.8b, v4.8b[7]            //(1)
    umlal       v27.8h, v6.8b, v3.8b        //(1)(nt-1-row)    *    src[2nt+1+col]

    dup         v21.8b, v4.8b[6]            //(2)
    umlal       v27.8h, v19.8b, v20.8b      //(1)(nt-1-col)    *    src[2nt-1-row]

    dup         v30.8h,w4                   //(2)
    add         v5.8b,  v5.8b ,  v7.8b      //(1)

    sub         v6.8b,  v6.8b ,  v7.8b      //(1)

    dup         v22.8b, v4.8b[5]            //(3)
    umlal       v30.8h, v5.8b, v0.8b        //(2)

    dup         v28.8h,w4                   //(3)
    umlal       v30.8h, v17.8b, v1.8b       //(2)

    umlal       v30.8h, v6.8b, v3.8b        //(2)
    umlal       v30.8h, v19.8b, v21.8b      //(2)

    sshl        v27.8h, v27.8h, v29.8h      //(1)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(2)
    sub         v6.8b,  v6.8b ,  v7.8b      //(2)

    xtn         v27.8b,  v27.8h             //(1)
    umlal       v28.8h, v5.8b, v0.8b        //(3)

    dup         v23.8b, v4.8b[4]            //(4)
    umlal       v28.8h, v17.8b, v1.8b       //(3)

    dup         v25.8h,w4                   //(4)
    umlal       v28.8h, v6.8b, v3.8b        //(3)

    st1         {v27.8b},[x2], x3           //(1)str 8 values
    umlal       v28.8h, v19.8b, v22.8b      //(3)

    sshl        v30.8h, v30.8h, v29.8h      //(2)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(3)
    sub         v6.8b,  v6.8b ,  v7.8b      //(3)

    xtn         v30.8b,  v30.8h             //(2)
    umlal       v25.8h, v5.8b, v0.8b        //(4)

    dup         v20.8b, v4.8b[3]            //(5)
    umlal       v25.8h, v17.8b, v1.8b       //(4)

    dup         v16.8h,w4                   //(5)
    umlal       v25.8h, v6.8b, v3.8b        //(4)

    st1         {v30.8b},[x2], x3           //(2)str 8 values
    umlal       v25.8h, v19.8b, v23.8b      //(4)

    sshl        v28.8h, v28.8h, v29.8h      //(3)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(4)
    sub         v6.8b,  v6.8b ,  v7.8b      //(4)

    xtn         v28.8b,  v28.8h             //(3)
    umlal       v16.8h, v5.8b, v0.8b        //(5)

    dup         v21.8b, v4.8b[2]            //(6)
    umlal       v16.8h, v17.8b, v1.8b       //(5)

    dup         v18.8h,w4                   //(6)
    umlal       v16.8h, v6.8b, v3.8b        //(5)

    st1         {v28.8b},[x2], x3           //(3)str 8 values
    umlal       v16.8h, v19.8b, v20.8b      //(5)

    sshl        v25.8h, v25.8h, v29.8h      //(4)shr
    add         v5.8b,  v5.8b ,  v7.8b      //(5)
    sub         v6.8b,  v6.8b ,  v7.8b      //(5)

    xtn         v25.8b,  v25.8h             //(4)
    umlal       v18.8h, v5.8b, v0.8b        //(6)

    dup         v22.8b, v4.8b[1]            //(7)
    umlal       v18.8h, v17.8b, v1.8b       //(6)

    dup         v26.8h,w4                   //(7)
    umlal       v18.8h, v6.8b, v3.8b        //(6)

    st1         {v25.8b},[x2], x3           //(4)str 8 values
    umlal       v18.8h, v19.8b, v21.8b      //(6)

    sshl        v16.8h, v16.8h, v29.8h      //(5)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(6)
    sub         v6.8b,  v6.8b ,  v7.8b      //(6)

    xtn         v16.8b,  v16.8h             //(5)
    umlal       v26.8h, v5.8b, v0.8b        //(7)

    dup         v23.8b, v4.8b[0]            //(8)
    umlal       v26.8h, v17.8b, v1.8b       //(7)

    dup         v24.8h,w4                   //(8)
    umlal       v26.8h, v6.8b, v3.8b        //(7)

    st1         {v16.8b},[x2], x3           //(5)str 8 values
    umlal       v26.8h, v19.8b, v22.8b      //(7)

    sshl        v18.8h, v18.8h, v29.8h      //(6)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(7)
    sub         v6.8b,  v6.8b ,  v7.8b      //(7)

    xtn         v18.8b,  v18.8h             //(6)
    umlal       v24.8h, v5.8b, v0.8b        //(8)


    umlal       v24.8h, v17.8b, v1.8b       //(8)

    umlal       v24.8h, v6.8b, v3.8b        //(8)

    st1         {v18.8b},[x2], x3           //(6)str 8 values
    umlal       v24.8h, v19.8b, v23.8b      //(8)

    sshl        v26.8h, v26.8h, v29.8h      //(7)shr

    subs        x7, x7, #8                  //one tile accounted for

    beq         epilog                      //that was the only tile: just flush rows (7) and (8)

    // Pointer update and operand preload for the next tile ((1n) tags).
    // Every csel below, and the final beq, uses the flags of the single
    // "subs x1" that follows; nothing in between writes the flags.
    //   gt: stay in the same 8 rows, move 8 columns to the right
    //   le: go back to column 0 and move to the next 8 rows
    subs        x1, x1, #8                  //row counter
    add         x20, x12, #8                //col inc
    csel        x12, x20, x12,gt
    add         x20, x14, #8                //also for col inc
    csel        x14, x20, x14,gt
    csel        x1, x4, x1,le               //nt reloaded (refresh the value)
    add         x20, x11, #1                //x12 reset
    csel        x12, x20, x12,le

    csel        x14, x0, x14,le             //x14 reset
    ld1         {v17.8b},[x12]              //(1n)(1-8)load 8 coeffs [col+1]

    sub         x20, x6, #8                 //for next set of rows
    csel        x6, x20, x6,le
    ld1         {v3.8b},[x14]               //(1n)(1-8)load 8 src[2nt+1+col]

    add         x20, x5, #8                 //next 8 entries of the row+1 table when moving down
    csel        x5, x20, x5,le
    dup         v27.8h,w4                   //(1n)(1)

    ld1         {v5.8b},[x5]                //(1n)(row+1 value)

    ld1         {v4.8b},[x6]                //(1n)(1-8)src[2nt-1-row]
    sub         v19.8b,  v2.8b ,  v17.8b    //(1n)(1-8)[nt-1-col]

    dup         v20.8b, v4.8b[7]            //(1n)(1)
    sub         v6.8b,  v2.8b ,  v5.8b      //(1n)(nt-1-row) value

    beq         epilog                      //tests the "subs x1" result above

kernel_plnr:
    // Steady-state loop, one 8x8 tile per iteration. On entry rows (7) and
    // (8) of the previous tile are still in v26/v24, and the operands of
    // the current tile (v3, v4, v5, v6, v17, v19, v20, v27) are preloaded.
    // Each iteration:
    //   1. narrows and stores rows (7)/(8) of the previous tile,
    //   2. moves dst (x2) to the start of the current tile,
    //   3. computes the current tile and stores its rows (1)-(6),
    //   4. updates the source/coefficient pointers and preloads the
    //      operands of the next tile ((1n) tags).
    // Two flag settings steer the csel instructions:
    //   "cmp x1, #0"  -> the dst adjustment (x2) and the reload of x1
    //   "subs x1, #8" -> the pointer updates for the next tile
    // The loop ends when the tile counter x7 reaches zero.

    cmp         x1, #0                      // (cond loop)
    sshl        v24.8h, v24.8h, v29.8h      //(8)shr

    xtn         v26.8b,  v26.8h             //(7)
    umlal       v27.8h, v5.8b, v0.8b        //(1)(row+1)    *    src[nt-1]

    xtn         v24.8b,  v24.8h             //(8)
    umlal       v27.8h, v17.8b, v1.8b       //(1)(col+1)    *    src[3nt+1]

    dup         v21.8b, v4.8b[6]            //(2)
    umlal       v27.8h, v6.8b, v3.8b        //(1)(nt-1-row)    *    src[2nt+1+col]

    dup         v30.8h,w4                   //(2)
    umlal       v27.8h, v19.8b, v20.8b      //(1)(nt-1-col)    *    src[2nt-1-row]

    st1         {v26.8b},[x2], x3           //(7)str 8 values
    add         v5.8b,  v5.8b ,  v7.8b      //(1)

    st1         {v24.8b},[x2], x3           //(8)str 8 values
    sub         v6.8b,  v6.8b ,  v7.8b      //(1)

    add         x20, x2, x9                 //since more cols to fill, dst + 8 - 8*strd (cond loop)
    csel        x2, x20, x2,gt
    umlal       v30.8h, v5.8b, v0.8b        //(2)

    sub         x20, x2, x10                //else go to next set of rows, dst - (nt-8) (cond loop)
    csel        x2, x20, x2,le
    umlal       v30.8h, v17.8b, v1.8b       //(2)

    dup         v22.8b, v4.8b[5]            //(3)
    umlal       v30.8h, v6.8b, v3.8b        //(2)

    dup         v28.8h,w4                   //(3)
    umlal       v30.8h, v19.8b, v21.8b      //(2)

    sshl        v27.8h, v27.8h, v29.8h      //(1)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(2)
    csel        x1, x4, x1,le               //nt reloaded (refresh the value)    (cond loop)

    sub         v6.8b,  v6.8b ,  v7.8b      //(2)
    subs        x1, x1, #8                  //row counter (loop); flags now drive the csels below

    xtn         v27.8b,  v27.8h             //(1)
    umlal       v28.8h, v5.8b, v0.8b        //(3)

    dup         v23.8b, v4.8b[4]            //(4)
    umlal       v28.8h, v17.8b, v1.8b       //(3)

    dup         v25.8h,w4                   //(4)
    umlal       v28.8h, v6.8b, v3.8b        //(3)

    st1         {v27.8b},[x2], x3           //(1)str 8 values
    umlal       v28.8h, v19.8b, v22.8b      //(3)

    sshl        v30.8h, v30.8h, v29.8h      //(2)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(3)

    sub         v6.8b,  v6.8b ,  v7.8b      //(3)

    xtn         v30.8b,  v30.8h             //(2)
    umlal       v25.8h, v5.8b, v0.8b        //(4)

    dup         v20.8b, v4.8b[3]            //(5)
    umlal       v25.8h, v17.8b, v1.8b       //(4)

    dup         v16.8h,w4                   //(5)
    umlal       v25.8h, v6.8b, v3.8b        //(4)

    st1         {v30.8b},[x2], x3           //(2)str 8 values
    umlal       v25.8h, v19.8b, v23.8b      //(4)

    sshl        v28.8h, v28.8h, v29.8h      //(3)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(4)

    sub         v6.8b,  v6.8b ,  v7.8b      //(4)

    xtn         v28.8b,  v28.8h             //(3)
    umlal       v16.8h, v5.8b, v0.8b        //(5)

    dup         v21.8b, v4.8b[2]            //(6)
    umlal       v16.8h, v17.8b, v1.8b       //(5)

    dup         v18.8h,w4                   //(6)
    umlal       v16.8h, v6.8b, v3.8b        //(5)

    st1         {v28.8b},[x2], x3           //(3)str 8 values
    umlal       v16.8h, v19.8b, v20.8b      //(5)

    add         x20, x11, #1                //x12 reset (cond loop)
    csel        x12, x20, x12,le
    sshl        v25.8h, v25.8h, v29.8h      //(4)shr

    add         x20, x12, #8                //col inc (cond loop)
    csel        x12, x20, x12,gt
    add         v5.8b,  v5.8b ,  v7.8b      //(5)

    add         x20, x14, #8                //also for col inc (cond loop)
    csel        x14, x20, x14,gt
    sub         v6.8b,  v6.8b ,  v7.8b      //(5)

    xtn         v25.8b,  v25.8h             //(4)
    umlal       v18.8h, v5.8b, v0.8b        //(6)

    dup         v22.8b, v4.8b[1]            //(7)
    umlal       v18.8h, v17.8b, v1.8b       //(6)

    dup         v26.8h,w4                   //(7)
    umlal       v18.8h, v6.8b, v3.8b        //(6)

    st1         {v25.8b},[x2], x3           //(4)str 8 values
    umlal       v18.8h, v19.8b, v21.8b      //(6)

    csel        x14, x0, x14,le             //x14 reset (cond loop)
    sshl        v16.8h, v16.8h, v29.8h      //(5)shr

    sub         x20, x6, #8                 //for next set of rows (cond loop)
    csel        x6, x20, x6,le
    add         v5.8b,  v5.8b ,  v7.8b      //(6)

    add         x20, x5, #8                 // (cond loop)
    csel        x5, x20, x5,le
    sub         v6.8b,  v6.8b ,  v7.8b      //(6)

    xtn         v16.8b,  v16.8h             //(5)
    umlal       v26.8h, v5.8b, v0.8b        //(7)

    dup         v23.8b, v4.8b[0]            //(8)
    umlal       v26.8h, v17.8b, v1.8b       //(7)

    dup         v24.8h,w4                   //(8)
    umlal       v26.8h, v6.8b, v3.8b        //(7)

    st1         {v16.8b},[x2], x3           //(5)str 8 values
    umlal       v26.8h, v19.8b, v22.8b      //(7)

    ld1         {v4.8b},[x6]                //(1n)(1-8)src[2nt-1-row] (all lanes of the old v4 were already broadcast)
    sshl        v18.8h, v18.8h, v29.8h      //(6)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(7)

    sub         v6.8b,  v6.8b ,  v7.8b      //(7)

    xtn         v18.8b,  v18.8h             //(6)
    umlal       v24.8h, v5.8b, v0.8b        //(8)

    ld1         {v5.8b},[x5]                //(row+1 value)
    umlal       v24.8h, v17.8b, v1.8b       //(8)

    dup         v20.8b, v4.8b[7]            //(1n)(1)
    umlal       v24.8h, v6.8b, v3.8b        //(8)

    st1         {v18.8b},[x2], x3           //(6)str 8 values
    umlal       v24.8h, v19.8b, v23.8b      //(8)

    ld1         {v17.8b},[x12]              //(1n)(1-8)load 8 coeffs [col+1]
    sub         v6.8b,  v2.8b ,  v5.8b      //(nt-1-row) value

    subs        x7, x7, #8                  //col counter

    ld1         {v3.8b},[x14]               //(1n)(1-8)load 8 src[2nt+1+col]
    sshl        v26.8h, v26.8h, v29.8h      //(7)shr

    dup         v27.8h,w4                   //(1n)(1)
    sub         v19.8b,  v2.8b ,  v17.8b    //(1n)(1-8)[nt-1-col]

    bne         kernel_plnr                 //tests the "subs x7" result above

epilog:
    // Pipeline drain for the 8/16/32 path: rows (7) and (8) of the last
    // 8x8 tile are still in v26 (already shifted) and v24 (not yet shifted).

    xtn         v26.8b,  v26.8h             //(7)
    st1         {v26.8b},[x2], x3           //(7)str 8 values

    sshl        v24.8h, v24.8h, v29.8h      //(8)shr
    xtn         v24.8b,  v24.8h             //(8)
    st1         {v24.8b},[x2], x3           //(8)str 8 values

//@ ========== ***************** =====================

    // The block is complete. Branch unconditionally so that the 4x4 code
    // that follows can never be entered from here; every path into epilog
    // arrives with Z set, so this matches the former "beq" without
    // depending on flags set many instructions earlier.
    b           end_loop

tf_sz_4:
    // nt == 4: one output row (4 bytes) per loop iteration, x1 counts rows.
    // Uses v0, v1, v2, v5, v6, v7 and x1, x6, x12, x14 as prepared by the
    // entry code. The column operands do not change from row to row, so
    // they are loaded and derived once, ahead of the loop.
    ld1         {v25.8b},[x14]              //load src[2nt+1+col]
    ld1         {v17.8b},[x12]              //load 8 coeffs [col+1]
    sub         v19.8b,  v2.8b ,  v17.8b    //[nt-1-col]
loop_sz_4:
    ldrb        w7,  [x6], #-1              //src[2nt-1-row] (dec to take into account row)
    dup         v4.8b,w7                    //src[2nt-1-row]

    umull       v27.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
    umlal       v27.8h, v6.8b, v25.8b       //(nt-1-row)    *    src[2nt+1+col]
    umlal       v27.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
    umlal       v27.8h, v19.8b, v4.8b       //(nt-1-col)    *    src[2nt-1-row]
    // Rounding narrow: (sum + 4) >> 3. This folds the "+ nt" term and the
    // shift that the larger sizes perform with a dup of nt plus sshl.
    rshrn       v27.8b, v27.8h,#3
    st1         {v27.s}[0],[x2], x3         //only the low 4 bytes are one row of the 4x4 block

    add         v5.8b,  v5.8b ,  v7.8b      //row++ [(row+1)++]
    sub         v6.8b,  v6.8b ,  v7.8b      //[nt-1-row]--
    subs        x1, x1, #1

    bne         loop_sz_4

end_loop:
    // Common exit: restore the register pair pushed at function entry and
    // return to the caller.
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

    ret