@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_horz_neon.s
@*
@* @brief
@*  contains function definition for intra prediction  interpolation filters
@*
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_horz()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@
@/**
@*******************************************************************************
@*
@* @brief
@*     horizontal intra prediction for luma blocks.
@*
@* @par description:
@*      horizontal intra prediction (mode 10) using the reference samples
@*      pointed to by 'pu1_ref', written to the tu block pointed to by
@*      'pu1_dst'; refer to section 8.4.4.2.6 in the standard (special case)
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the reference samples
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  integer transform block size
@*
@* @param[in] mode
@*  integer intraprediction mode
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
@                                word32 src_strd,
@                                uword8 *pu1_dst,
@                                word32 dst_strd,
@                                word32 nt,
@                                word32 mode)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 =>  src_strd
@r2 => *pu1_dst
@r3 =>  dst_strd
@r4 =>  nt (loaded from the stack)

@ byte offset of the stacked argument 'nt' from sp once the prologue has run:
@ stmfd pushes 10 core registers (40 bytes), vpush pushes d8-d15 (64 bytes)
.equ    nt_offset,      104

.text
.align 4




.globl ihevc_intra_pred_luma_horz_a9q

.type ihevc_intra_pred_luma_horz_a9q, %function

@------------------------------------------------------------------------------
@ ihevc_intra_pred_luma_horz_a9q
@
@ in:     r0      = pu1_ref
@         r2      = pu1_dst
@         r3      = dst_strd
@         [sp]    = nt (fifth argument, read after the prologue via nt_offset)
@         r1 (src_strd) and the stacked 'mode' argument are never read.
@
@ what it does:
@         row 'row' of the destination block is filled with the single byte
@         pu1_ref[two_nt - 1 - row], where two_nt = 2 * nt.
@         for nt == 4, 8 and 16 the first row is filtered instead:
@             pu1_ref[two_nt - 1] + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)
@         saturated to an unsigned 8 bit value.
@         any nt other than 4, 8 or 16 takes the unfiltered 32-wide loop, which
@         writes 32 columns per row and 16 rows per iteration.
@
@ saves:  r4-r12 and d8-d15 are pushed on entry and restored on exit.
@         every path returns by popping the saved lr straight into pc.
@------------------------------------------------------------------------------
ihevc_intra_pred_luma_horz_a9q:

    stmfd       sp!, {r4-r12, r14}          @save r4-r12 and the return address
    vpush       {d8 - d15}                  @d8-d15 (q4-q7) are used as scratch below
    ldr         r4,[sp,#nt_offset]          @loads nt

    lsl         r6,r4,#1                    @two_nt

    add         r12,r0,r6                   @&pu1_ref[two_nt]
    cmp         r4,#4                       @if nt == 4
    beq         core_loop_4

    cmp         r4,#8                       @if nt == 8
    beq         core_loop_8

    cmp         r4,#16                      @if nt == 16
    beq         core_loop_16
    sub         r12,r12,#16                 @&pu1_ref[two_nt - 16]
    add         r9,r2,#16                   @r9 writes columns 16-31, r2 writes columns 0-15

@ 32-wide path: no first-row filter. each iteration consumes 16 reference
@ bytes (highest address first) and emits 16 full rows; r4 counts rows left.
core_loop_32:
    vld1.8      {q0},[r12]                  @load 16 values. d1[7] will have the 1st value.

    vdup.8      q1,d1[7]                    @duplicate the i value.

    vdup.8      q2,d1[6]                    @duplicate the ii value.
    vdup.8      q3,d1[5]                    @duplicate the iii value.
    vst1.8      {q1},[r2],r3                @store in 1st row, columns 0-15
    vst1.8      {q1},[r9],r3                @store in 1st row, columns 16-31

    vdup.8      q4,d1[4]
    vst1.8      {q2},[r2],r3
    vst1.8      {q2},[r9],r3

    vdup.8      q1,d1[3]
    vst1.8      {q3},[r2],r3
    vst1.8      {q3},[r9],r3

    vdup.8      q2,d1[2]
    vst1.8      {q4},[r2],r3
    vst1.8      {q4},[r9],r3

    vdup.8      q3,d1[1]
    vst1.8      {q1},[r2],r3
    vst1.8      {q1},[r9],r3

    vdup.8      q4,d1[0]
    vst1.8      {q2},[r2],r3
    vst1.8      {q2},[r9],r3

    vdup.8      q1,d0[7]
    vst1.8      {q3},[r2],r3
    vst1.8      {q3},[r9],r3

    vdup.8      q2,d0[6]
    vst1.8      {q4},[r2],r3
    vst1.8      {q4},[r9],r3

    vdup.8      q3,d0[5]
    vst1.8      {q1},[r2],r3
    vst1.8      {q1},[r9],r3

    vdup.8      q4,d0[4]
    vst1.8      {q2},[r2],r3
    vst1.8      {q2},[r9],r3

    vdup.8      q1,d0[3]
    vst1.8      {q3},[r2],r3
    vst1.8      {q3},[r9],r3

    vdup.8      q2,d0[2]
    vst1.8      {q4},[r2],r3
    vst1.8      {q4},[r9],r3

    vdup.8      q3,d0[1]
    vst1.8      {q1},[r2],r3
    vst1.8      {q1},[r9],r3
    sub         r12,r12,#16                 @step back to the previous 16 reference bytes

    vdup.8      q4,d0[0]
    vst1.8      {q2},[r2],r3
    vst1.8      {q2},[r9],r3

    subs        r4,r4,#16                   @decrement the loop count by 16 (flags used by bgt below)
    vst1.8      {q3},[r2],r3
    vst1.8      {q3},[r9],r3

    vst1.8      {q4},[r2],r3
    vst1.8      {q4},[r9],r3
    bgt         core_loop_32
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore registers; loading pc returns to the caller

@ 16x16 path: filtered first row (two 8-byte halves), then 15 replicated rows.
core_loop_16:
    ldrb        lr,[r12],#1                 @pu1_ref[two_nt]; r12 advances to &pu1_ref[two_nt + 1]
    vld1.8      {q15},[r12]                 @pu1_ref[two_nt + 1 + col]

    vdup.8      d28,lr                      @pu1_ref[two_nt] in every lane
    sub         r12,r12,#17                 @&pu1_ref[two_nt - 16]
    vld1.8      {q0},[r12]                  @d1[7] = pu1_ref[two_nt - 1] ... d0[0] = pu1_ref[two_nt - 16]
    vdup.8      d26,d1[7]
    vmovl.u8    q13,d26                     @pu1_ref[two_nt - 1] widened to 16 bit

    vdup.8      q1,d1[6]
    vsubl.u8    q12,d30,d28                 @pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt], columns 0-7

    vdup.8      q2,d1[5]
    vshr.s16    q12,q12,#1                  @arithmetic >> 1

    vdup.8      q3,d1[4]
    vqadd.s16   q11,q13,q12

    vdup.8      q4,d1[3]
    vqmovun.s16 d22,q11                     @saturate to unsigned 8 bit

    vst1.8      {d22},[r2]!                 @row 0, columns 0-7; r2 += 8

    vdup.8      q5,d1[2]
    vsubl.u8    q12,d31,d28                 @same filter for columns 8-15

    vdup.8      q6,d1[1]
    vshr.s16    q12,q12,#1

    vdup.8      q7,d1[0]
    vqadd.s16   q11,q13,q12

    vdup.8      q8,d0[7]
    vqmovun.s16 d22,q11

    vst1.8      {d22},[r2],r3               @row 0, columns 8-15; r2 += dst_strd
    sub         r2,r2,#8                    @back to column 0 of row 1

    vst1.8      {q1},[r2],r3

    vst1.8      {q2},[r2],r3
    vst1.8      {q3},[r2],r3
    vst1.8      {q4},[r2],r3

    vdup.8      q1,d0[6]
    vst1.8      {q5},[r2],r3

    vdup.8      q2,d0[5]
    vst1.8      {q6},[r2],r3

    vdup.8      q3,d0[4]
    vst1.8      {q7},[r2],r3

    vdup.8      q4,d0[3]
    vst1.8      {q8},[r2],r3

    vdup.8      q5,d0[2]
    vst1.8      {q1},[r2],r3

    vdup.8      q6,d0[1]
    vst1.8      {q2},[r2],r3

    vdup.8      q7,d0[0]
    vst1.8      {q3},[r2],r3

    vst1.8      {q4},[r2],r3
    vst1.8      {q5},[r2],r3
    vst1.8      {q6},[r2],r3
    vst1.8      {q7},[r2],r3
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore registers; loading pc returns to the caller


@ 8x8 path: filtered first row, then 7 replicated rows.
core_loop_8:
    ldrb        lr,[r12]                    @pu1_ref[two_nt]
    add         r12,r12,#1                  @pu1_ref[two_nt + 1]
    vld1.8      {d30},[r12]                 @pu1_ref[two_nt + 1 + col]

    sub         r12,r12,#9                  @&pu1_ref[two_nt - 8]
    vld1.8      {d0},[r12]                  @d0[7] = pu1_ref[two_nt - 1] ... d0[0] = pu1_ref[two_nt - 8]
    vdup.8      d26,d0[7]
    vdup.8      d28,lr                      @pu1_ref[two_nt] in every lane

    vdup.8      d3,d0[6]
    vmovl.u8    q13,d26                     @pu1_ref[two_nt - 1] widened to 16 bit

    vdup.8      d4,d0[5]
    vsubl.u8    q12,d30,d28                 @pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]

    vdup.8      d5,d0[4]
    vshr.s16    q12,q12,#1                  @arithmetic >> 1

    vdup.8      d6,d0[3]
    vqadd.s16   q11,q13,q12

    vdup.8      d7,d0[2]
    vqmovun.s16 d22,q11                     @saturate to unsigned 8 bit

    vst1.8      {d22},[r2],r3               @row 0 (filtered)
    vst1.8      {d3},[r2],r3

    vdup.8      d8,d0[1]
    vst1.8      {d4},[r2],r3
    vst1.8      {d5},[r2],r3

    vdup.8      d9,d0[0]
    vst1.8      {d6},[r2],r3
    vst1.8      {d7},[r2],r3

    vst1.8      {d8},[r2],r3
    vst1.8      {d9},[r2],r3
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore registers; loading pc returns to the caller


@ 4x4 path: 8 bytes are loaded and processed, only the low 4 of each
@ result are stored per row.
core_loop_4:
    ldrb        lr,[r12]                    @pu1_ref[two_nt]
    add         r12,r12,#1                  @pu1_ref[two_nt + 1]
    vld1.8      {d30},[r12]                 @pu1_ref[two_nt + 1 + col]

    sub         r12,r12,#5                  @&pu1_ref[two_nt - 4]
    vld1.8      {d0},[r12]                  @d0[3] = pu1_ref[two_nt - 1] ... d0[0] = pu1_ref[two_nt - 4]
    vdup.8      d28,lr                      @pu1_ref[two_nt] in every lane
    vdup.8      d26,d0[3]
    vmovl.u8    q13,d26                     @pu1_ref[two_nt - 1] widened to 16 bit

    vdup.8      d3,d0[2]
    vsubl.u8    q12,d30,d28                 @pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]

    vdup.8      d4,d0[1]
    vshr.s16    q12,q12,#1                  @arithmetic >> 1

    vdup.8      d5,d0[0]
    vqadd.s16   q11,q13,q12

    vqmovun.s16 d22,q11                     @saturate to unsigned 8 bit

    vst1.32     {d22[0]},[r2],r3            @row 0 (filtered), 4 bytes
    vst1.32     {d3[0]},[r2],r3
    vst1.32     {d4[0]},[r2],r3
    vst1.32     {d5[0]},[r2],r3
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore registers; loading pc returns to the caller
end_func:                                   @not branched to from the code above; every path has already returned
.size ihevc_intra_pred_luma_horz_a9q, .-ihevc_intra_pred_luma_horz_a9q