@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_filters_planar.s
@*
@* @brief
@*  contains function definitions for intra prediction planar filtering.
@*  functions are coded using neon instructions and can be compiled using
@*  rvct
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    luma intraprediction filter for planar input
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the planar coefficients
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
@                                  word32 src_strd,
@                                  uword8* pu1_dst,
@                                  word32 dst_strd,
@                                  word32 nt,
@                                  word32 mode,
@                  word32 pi1_coeff)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #104
@   nt
@   mode
@   pi1_coeff

@ byte offset from sp to the first stack argument (nt) once the function
@ prologue has pushed r4-r12,r14 (10 words = 40 bytes) and d8-d15 (64 bytes)
.equ    nt_offset,      104

.text
.align 4




.globl ihevc_intra_pred_luma_planar_a9q
.extern gau1_ihevc_planar_factor
.extern gau1_ihevc_planar_factor_1

@ position-independent references to the two external tables. each word holds
@ (table - ulblN - 8); the function loads the word and adds pc at label ulblN.
@ the -8 cancels the fact that pc reads as the address of that add instruction
@ plus 8 in arm state, so the sum is the absolute address of the table.
gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl1 - 8

gau1_ihevc_planar_factor_1_addr:
.long gau1_ihevc_planar_factor_1 - ulbl2 - 8


.type ihevc_intra_pred_luma_planar_a9q, %function

@-----------------------------------------------------------------------------
@ ihevc_intra_pred_luma_planar_a9q
@
@ purpose:
@   planar intra prediction of an nt x nt luma block. every output sample is
@     dst[row][col] = ( (row+1)    * src[nt-1]
@                     + (col+1)    * src[3nt+1]
@                     + (nt-1-row) * src[2nt+1+col]
@                     + (nt-1-col) * src[2nt-1-row]
@                     + nt ) >> (32 - clz(nt))
@   (32 - clz(nt) equals log2(nt) + 1 when nt is a power of two).
@
@ in:
@   r0      = pu1_ref (src)
@   r1      = src_strd (never read; r1 is reused as a counter)
@   r2      = pu1_dst
@   r3      = dst_strd
@   [sp]    = nt (read through nt_offset after the prologue pushes)
@ out:
@   nt rows written at pu1_dst, dst_strd bytes apart. nt == 4 stores 4 bytes
@   per row; the other path stores whole 8x8 tiles.
@ saved/restored: r4-r12, lr, d8-d15.
@
@ register roles (8/16/32 path):
@   r4  = nt                         r11 = &gau1_ihevc_planar_factor
@   r12 = pointer to (col+1) coeffs  r5  = pointer to (row+1) values
@                                          (gau1_ihevc_planar_factor_1)
@   r6  = pointer to 8 left samples  r14 = &src[2nt+1+col], r0 = &src[2nt+1]
@   r7  = 8 * (8x8 tiles left)       r1  = columns left in current row group
@   r9  = 8 - 8*dst_strd             r10 = nt - 8
@   d0  = src[nt-1] splat            d1  = src[3nt+1] splat
@   d2  = nt splat                   d7  = all ones (row step)
@   d5  = row+1, d6 = nt-1-row       d8  = col+1, d9 = nt-1-col
@   q7  = -(shift) for vshl (right shift)
@   NOTE(review): the contents of the two external tables are not visible
@   here; their roles above are taken from how the loads are used.
@
@ tiles are walked left to right inside a band of 8 rows, then down to the
@ next band. the kernel is software pipelined: rows (7) and (8) of a tile are
@ narrowed and stored while rows (1)..(6) of the next tile are computed.
@ the (n) tags in the comments give the row of the tile an instruction
@ belongs to; (1n) marks work done early for the next tile.
@-----------------------------------------------------------------------------
ihevc_intra_pred_luma_planar_a9q:

    stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers and lr (40 bytes)
    vpush       {d8 - d15}                  @save callee-saved neon registers (64 bytes)
    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r11, gau1_ihevc_planar_factor_addr @loads pc-relative offset of the coeff table
ulbl1:
    add         r11,r11,pc                  @r11 = &gau1_ihevc_planar_factor

    clz         r5, r4
    rsb         r5, r5, #32                 @shift = 32 - clz(nt)
    vdup.16     q7, r5
    vneg.s16    q7, q7                      @shr value (so vneg)
    vdup.8      d2, r4                      @nt
    vdup.s16    q8, r4                      @nt

    sub         r6, r4, #1                  @nt-1
    add         r6, r6, r0
    ldrb        r7, [r6]                    @one byte only: vdup.8 uses bits[7:0]; avoids an unaligned word load
    vdup.s8     d0, r7                      @src[nt-1]

    add         r6, r4, r4,lsl #1           @3nt
    add         r6, r6, #1                  @3nt + 1
    add         r6, r6, r0
    ldrb        r7, [r6]                    @one byte only (see above)
    vdup.s8     d1, r7                      @src[3nt+1]

    add         r6, r4, r4                  @2nt
    add         r14, r6, #1                 @2nt+1
    sub         r6, r6, #1                  @2nt-1
    add         r6, r6, r0                  @&src[2nt-1]
    add         r14, r14, r0                @&src[2nt+1]

    mov         r8, #1                      @row+1 (row is first 0)
    sub         r9, r4, r8                  @nt-1-row (row is first 0)

    vdup.s8     d5, r8                      @row + 1
    vdup.s8     d6, r9                      @nt - 1 - row
    vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row

    add         r12, r11, #1                @coeffs (to be reloaded after every row)
    mov         r1, r4                      @nt (row counter) (dec after every row)
    mov         r5, r2                      @dst (overwritten with the table pointer in the 8/16/32 path)
    mov         r10, #8                     @increment for the coeffs
    mov         r0, r14                     @&src[2nt+1] (to be reloaded after every row)

    cmp         r4, #4
    beq         tf_sz_4

@@ ========== ***************** =====================
prolog:
tf_sz_8_16_32:

    mov         r7, r4                      @column counter (set to no of cols)
    mov         r9, r4, lsr #3              @divide nt by 8
    mul         r7, r7, r9                  @nt * (nt/8) = 8 * number of 8x8 tiles
    ldr         r5, gau1_ihevc_planar_factor_1_addr @loads pc-relative offset of the row table
ulbl2:
    add         r5,r5,pc                    @r5 = &gau1_ihevc_planar_factor_1
    sub         r6, r6, #7                  @&src[2nt-8]: one 8-byte load then covers src[2nt-8 .. 2nt-1]
    mov         r8, r2
    lsl         r9, r3, #3                  @8*stride
    rsb         r9, r9, #8                  @8-8*stride (from below a tile to the top of the tile on its right)
    mov         r10, r4                     @nt
    sub         r10, r10, #8                @nt - 8

col_loop_8_16_32:

    vld1.s8     d8, [r12]                   @(1-8)load 8 coeffs [col+1]
    vdup.16     q6, r4                      @(1) accumulator starts at nt (rounding term)
    vld1.s8     d4, [r6]                    @(1-8)src[2nt-1-row]
    vsub.s8     d9, d2, d8                  @(1-8)[nt-1-col]


    vmlal.u8    q6, d5, d0                  @(1)(row+1) *   src[nt-1]

    vld1.s8     d3, [r14]                   @(1-8)load 8 src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(1)(col+1) *   src[3nt+1]

    vdup.s8     d20, d4[7]                  @(1) lane 7 is the left sample for the first row of the tile
    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row)  *   src[2nt+1+col]

    vdup.s8     d21, d4[6]                  @(2)
    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col)  *   src[2nt-1-row]

    vdup.16     q15, r4                     @(2)
    vadd.s8     d5, d5, d7                  @(1)

    vsub.s8     d6, d6, d7                  @(1)

    vdup.s8     d22, d4[5]                  @(3)
    vmlal.u8    q15, d5, d0                 @(2)

    vdup.16     q14, r4                     @(3)
    vmlal.u8    q15, d8, d1                 @(2)

    vmlal.u8    q15, d6, d3                 @(2)
    vmlal.u8    q15, d9, d21                @(2)

    vshl.s16    q6, q6, q7                  @(1)shr

    vadd.s8     d5, d5, d7                  @(2)
    vsub.s8     d6, d6, d7                  @(2)

    vmovn.i16   d12, q6                     @(1)
    vmlal.u8    q14, d5, d0                 @(3)

    vdup.8      d23, d4[4]                  @(4)
    vmlal.u8    q14, d8, d1                 @(3)

    vdup.16     q5, r4                      @(4)
    vmlal.u8    q14, d6, d3                 @(3)

    vst1.s8     d12, [r2], r3               @(1)str 8 values
    vmlal.u8    q14, d9, d22                @(3)

    vshl.s16    q15, q15, q7                @(2)shr

    vadd.s8     d5, d5, d7                  @(3)
    vsub.s8     d6, d6, d7                  @(3)

    vmovn.i16   d30, q15                    @(2)
    vmlal.u8    q5, d5, d0                  @(4)

    vdup.8      d20, d4[3]                  @(5)
    vmlal.u8    q5, d8, d1                  @(4)

    vdup.16     q8, r4                      @(5)
    vmlal.u8    q5, d6, d3                  @(4)

    vst1.s8     d30, [r2], r3               @(2)str 8 values
    vmlal.u8    q5, d9, d23                 @(4)

    vshl.s16    q14, q14, q7                @(3)shr

    vadd.s8     d5, d5, d7                  @(4)
    vsub.s8     d6, d6, d7                  @(4)

    vmovn.i16   d28, q14                    @(3)
    vmlal.u8    q8, d5, d0                  @(5)

    vdup.8      d21, d4[2]                  @(6)
    vmlal.u8    q8, d8, d1                  @(5)

    vdup.16     q9, r4                      @(6)
    vmlal.u8    q8, d6, d3                  @(5)

    vst1.s8     d28, [r2], r3               @(3)str 8 values
    vmlal.u8    q8, d9, d20                 @(5)

    vshl.s16    q5, q5, q7                  @(4)shr
    vadd.s8     d5, d5, d7                  @(5)
    vsub.s8     d6, d6, d7                  @(5)

    vmovn.i16   d10, q5                     @(4)
    vmlal.u8    q9, d5, d0                  @(6)

    vdup.8      d22, d4[1]                  @(7)
    vmlal.u8    q9, d8, d1                  @(6)

    vdup.16     q13, r4                     @(7)
    vmlal.u8    q9, d6, d3                  @(6)

    vst1.s8     d10, [r2], r3               @(4)str 8 values
    vmlal.u8    q9, d9, d21                 @(6)

    vshl.s16    q8, q8, q7                  @(5)shr

    vadd.s8     d5, d5, d7                  @(6)
    vsub.s8     d6, d6, d7                  @(6)

    vmovn.i16   d16, q8                     @(5)
    vmlal.u8    q13, d5, d0                 @(7)

    vdup.8      d23, d4[0]                  @(8)
    vmlal.u8    q13, d8, d1                 @(7)

    vdup.16     q12, r4                     @(8)
    vmlal.u8    q13, d6, d3                 @(7)

    vst1.s8     d16, [r2], r3               @(5)str 8 values
    vmlal.u8    q13, d9, d22                @(7)

    vshl.s16    q9, q9, q7                  @(6)shr

    vadd.s8     d5, d5, d7                  @(7)
    vsub.s8     d6, d6, d7                  @(7)

    vmovn.i16   d18, q9                     @(6)
    vmlal.u8    q12, d5, d0                 @(8)


    vmlal.u8    q12, d8, d1                 @(8)

    vmlal.u8    q12, d6, d3                 @(8)

    vst1.s8     d18, [r2], r3               @(6)str 8 values
    vmlal.u8    q12, d9, d23                @(8)

    vshl.s16    q13, q13, q7                @(7)shr

    subs        r7, r7, #8                  @one tile consumed

    beq         epilog                      @single tile (nt == 8): only rows (7),(8) are left to flush

    @ set up the pointers for the second tile; the gt/le conditionals below
    @ all use the flags of this subs (no instruction in between sets flags)
    subs        r1, r1, #8                  @row counter
    addgt       r12, r12, #8                @col inc
    addgt       r14, r14, #8                @also for col inc
    movle       r1, r4                      @nt reloaded (refresh the value)
    addle       r12, r11, #1                @r12 reset

    movle       r14, r0                     @r14 reset
    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]

    suble       r6, r6, #8                  @for next set of rows
    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]

    addle       r5, r5, #8
    vdup.16     q6, r4                      @(1n)(1)

    vld1.s8     d5, [r5]                    @(row+1 value) for the first row of the next tile

    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]

    vdup.s8     d20, d4[7]                  @(1n)(1)
    vsub.s8     d6, d2, d5                  @(nt-1-row) value

    beq         epilog                      @NOTE(review): appears unreachable - r1 hits zero here only
                                            @when nt == 8, and that case already left via the branch above

kernel_plnr:

    cmp         r1, #0                      @ (cond loop) gt: next tile is to the right, le: next tile starts a new row band
    vshl.s16    q12, q12, q7                @(8)shr

    vmovn.i16   d26, q13                    @(7)
    vmlal.u8    q6, d5, d0                  @(1)(row+1) *   src[nt-1]

    vmovn.i16   d24, q12                    @(8)
    vmlal.u8    q6, d8, d1                  @(1)(col+1) *   src[3nt+1]

    vdup.s8     d21, d4[6]                  @(2)
    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row)  *   src[2nt+1+col]

    vdup.16     q15, r4                     @(2)
    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col)  *   src[2nt-1-row]

    vst1.s8     d26, [r2], r3               @(7)str 8 values
    vadd.s8     d5, d5, d7                  @(1)

    vst1.s8     d24, [r2], r3               @(8)str 8 values
    vsub.s8     d6, d6, d7                  @(1)

    addgt       r2, r2, r9                  @since more cols to fill, dst + 8 - 8*strd (cond loop)
    vmlal.u8    q15, d5, d0                 @(2)

    suble       r2, r2, r10                 @else go to next set of rows, dst - (nt-8) (cond loop)
    vmlal.u8    q15, d8, d1                 @(2)

    vdup.s8     d22, d4[5]                  @(3)
    vmlal.u8    q15, d6, d3                 @(2)

    vdup.16     q14, r4                     @(3)
    vmlal.u8    q15, d9, d21                @(2)

    vshl.s16    q6, q6, q7                  @(1)shr

    vadd.s8     d5, d5, d7                  @(2)
    movle       r1, r4                      @nt reloaded (refresh the value)    (cond loop)

    vsub.s8     d6, d6, d7                  @(2)
    subs        r1, r1, #8                  @row counter (loop); flags now steer the pointer setup for the tile after this one

    vmovn.i16   d12, q6                     @(1)
    vmlal.u8    q14, d5, d0                 @(3)

    vdup.8      d23, d4[4]                  @(4)
    vmlal.u8    q14, d8, d1                 @(3)

    vdup.16     q5, r4                      @(4)
    vmlal.u8    q14, d6, d3                 @(3)

    vst1.s8     d12, [r2], r3               @(1)str 8 values
    vmlal.u8    q14, d9, d22                @(3)

    vshl.s16    q15, q15, q7                @(2)shr

    vadd.s8     d5, d5, d7                  @(3)

    vsub.s8     d6, d6, d7                  @(3)

    vmovn.i16   d30, q15                    @(2)
    vmlal.u8    q5, d5, d0                  @(4)

    vdup.8      d20, d4[3]                  @(5)
    vmlal.u8    q5, d8, d1                  @(4)

    vdup.16     q8, r4                      @(5)
    vmlal.u8    q5, d6, d3                  @(4)

    vst1.s8     d30, [r2], r3               @(2)str 8 values
    vmlal.u8    q5, d9, d23                 @(4)

    vshl.s16    q14, q14, q7                @(3)shr

    vadd.s8     d5, d5, d7                  @(4)

    vsub.s8     d6, d6, d7                  @(4)

    vmovn.i16   d28, q14                    @(3)
    vmlal.u8    q8, d5, d0                  @(5)

    vdup.8      d21, d4[2]                  @(6)
    vmlal.u8    q8, d8, d1                  @(5)

    vdup.16     q9, r4                      @(6)
    vmlal.u8    q8, d6, d3                  @(5)

    vst1.s8     d28, [r2], r3               @(3)str 8 values
    vmlal.u8    q8, d9, d20                 @(5)

    addle       r12, r11, #1                @r12 reset (cond loop)
    vshl.s16    q5, q5, q7                  @(4)shr

    addgt       r12, r12, #8                @col inc (cond loop)
    vadd.s8     d5, d5, d7                  @(5)

    addgt       r14, r14, #8                @also for col inc (cond loop)
    vsub.s8     d6, d6, d7                  @(5)

    vmovn.i16   d10, q5                     @(4)
    vmlal.u8    q9, d5, d0                  @(6)

    vdup.8      d22, d4[1]                  @(7)
    vmlal.u8    q9, d8, d1                  @(6)

    vdup.16     q13, r4                     @(7)
    vmlal.u8    q9, d6, d3                  @(6)

    vst1.s8     d10, [r2], r3               @(4)str 8 values
    vmlal.u8    q9, d9, d21                 @(6)

    movle       r14, r0                     @r14 reset (cond loop)
    vshl.s16    q8, q8, q7                  @(5)shr

    suble       r6, r6, #8                  @for next set of rows (cond loop)
    vadd.s8     d5, d5, d7                  @(6)

    addle       r5, r5, #8                  @ (cond loop)
    vsub.s8     d6, d6, d7                  @(6)

    vmovn.i16   d16, q8                     @(5)
    vmlal.u8    q13, d5, d0                 @(7)

    vdup.8      d23, d4[0]                  @(8)
    vmlal.u8    q13, d8, d1                 @(7)

    vdup.16     q12, r4                     @(8)
    vmlal.u8    q13, d6, d3                 @(7)

    vst1.s8     d16, [r2], r3               @(5)str 8 values
    vmlal.u8    q13, d9, d22                @(7)

    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    vshl.s16    q9, q9, q7                  @(6)shr

    vadd.s8     d5, d5, d7                  @(7)

    vsub.s8     d6, d6, d7                  @(7)

    vmovn.i16   d18, q9                     @(6)
    vmlal.u8    q12, d5, d0                 @(8)

    vld1.s8     d5, [r5]                    @(row+1 value)
    vmlal.u8    q12, d8, d1                 @(8)

    vdup.s8     d20, d4[7]                  @(1n)(1)
    vmlal.u8    q12, d6, d3                 @(8)

    vst1.s8     d18, [r2], r3               @(6)str 8 values
    vmlal.u8    q12, d9, d23                @(8)

    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
    vsub.s8     d6, d2, d5                  @(nt-1-row) value

    subs        r7, r7, #8                  @tile counter; zero when the tile just computed was the last

    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
    vshl.s16    q13, q13, q7                @(7)shr

    vdup.16     q6, r4                      @(1n)(1)
    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]

    bne         kernel_plnr

epilog:
    @ flush rows (7) and (8) of the last tile; q13 has already been shifted

    vmovn.i16   d26, q13                    @(7)
    vst1.s8     d26, [r2], r3               @(7)str 8 values

    vshl.s16    q12, q12, q7                @(8)shr
    vmovn.i16   d24, q12                    @(8)
    vst1.s8     d24, [r2], r3               @(8)str 8 values

@@ ========== ***************** =====================

    beq         end_loop                    @always taken: every path into epilog arrives with Z set and
                                            @the neon instructions above leave the flags untouched

tf_sz_4:
    @ 4x4 block: one row per iteration, only the low 4 lanes are stored.
    @ on entry d5 = row+1 (=1), d6 = nt-1-row (=3), d7 = 1, r1 = nt, r6 = &src[2nt-1]
    vld1.s8     d10, [r14]                  @load src[2nt+1+col]
    vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
loop_sz_4:
    ldrb        r7, [r6], #-1               @src[2nt-1-row] (dec to take into account row); one byte only,
                                            @vdup.8 below uses bits[7:0]
    vdup.s8     d4, r7                      @src[2nt-1-row]

    vsub.s8     d9, d2, d8                  @[nt-1-col]

    vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
    vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
    vmlal.u8    q6, d9, d4                  @(nt-1-col) *   src[2nt-1-row]
@   vadd.i16    q6, q6, q8          @add (nt)
@   vshl.s16    q6, q6, q7          @shr
@   vmovn.i16   d12, q6
    vrshrn.s16  d12,q6,#3                   @rounding narrow: (sum + 4) >> 3, replaces the three lines above for nt = 4
    vst1.s32    {d12[0]}, [r2], r3          @store 4 pixels of this row

    vadd.s8     d5, d5, d7                  @row++ [(row+1)++]
    vsub.s8     d6, d6, d7                  @[nt-1-row]--
    subs        r1, r1, #1

    bne         loop_sz_4

end_loop:
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore the saved registers and return (saved lr popped into pc)