///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_horz_neon.s
//*
//* @brief
//*  contains function definition for intra prediction  interpolation filters
//*
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_intra_pred_chroma_horz_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//
///**
//*******************************************************************************
//*
//* @brief
//*     intra prediction interpolation filter for horizontal chroma variable.
//*
//* @par description:
//*      horizontal intra prediction (mode 10) for chroma: the reference samples
//*      pointed to by 'pu1_ref' are replicated along each row of the tu block
//*      pointed to by 'pu1_dst'. refer to section 8.4.4.2.6 in the standard
//*      (special case)
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  integer transform block size
//*
//* @param[in] mode
//*  integer intraprediction mode
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
//                                  word32 src_strd,
//                                  uword8 *pu1_dst,
//                                  word32 dst_strd,
//                                  word32 nt,
//                                  word32 mode)
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 =>  src_strd  (not read by the code)
//x2 => *pu1_dst
//x3 =>  dst_strd
//x4 =>  nt
//x5 =>  mode      (not read by the code)

.text
.align 4
.include "ihevc_neon_macros.s"


.globl ihevc_intra_pred_chroma_horz_av8

.type ihevc_intra_pred_chroma_horz_av8, %function

//-----------------------------------------------------------------------------
// void ihevc_intra_pred_chroma_horz_av8(uword8 *pu1_ref, word32 src_strd,
//                                       uword8 *pu1_dst, word32 dst_strd,
//                                       word32 nt,       word32 mode)
//
// Horizontal intra prediction for chroma. Row r of the destination is filled
// with the 2-byte element found at pu1_ref + 4*nt - 2 - 2*r (presumably one
// interleaved Cb/Cr pair), replicated across the row:
//      nt == 4  ->  4 rows of  8 bytes
//      nt == 8  ->  8 rows of 16 bytes
//      other nt -> rows are produced 16 at a time, 32 bytes per row
//                  NOTE(review): this path is only exact for nt == 16.
//
// ABI:   AAPCS64, GNU as syntax, leaf function
// In:    x0 = pu1_ref
//        w1 = src_strd (unused)
//        x2 = pu1_dst
//        w3 = dst_strd, in bytes
//        w4 = nt
//        w5 = mode (unused)
// Out:   none (destination block written through x2)
// Clobb: x2, x3, x4, x9, x12, v0-v6, v18-v20, flags
// Stack: not used. Only caller-saved registers are touched, so nothing is
//        saved or restored.
//-----------------------------------------------------------------------------
ihevc_intra_pred_chroma_horz_av8:

    // nt and dst_strd are 32-bit arguments; AAPCS64 leaves bits 63:32 of the
    // argument registers unspecified, so widen them before 64-bit use.
    sxtw        x3,w3                       // x3 = (word64)dst_strd
    sxtw        x4,w4                       // x4 = (word64)nt

    add         x12,x0,x4,lsl #2            // x12 = pu1_ref + 4*nt (one past the
                                            // element used for row 0)
    cmp         x4,#4                       // nt == 4 ?
    beq         core_loop_4

    cmp         x4,#8                       // nt == 8 ?
    beq         core_loop_8

    // Every other nt falls through to the 32-bytes-per-row path.
    sub         x12,x12,#16                 // x12 -> elements for rows 0..7
    add         x9,x2,#16                   // x9  -> second 16 bytes of each row

    // Loop invariant: x4 = rows still to write, x12 -> the 8 elements that feed
    // the next 8 rows, x2/x9 -> left/right half of the next destination row.
    // The dup for a later row is issued ahead of the stores of an earlier row;
    // v2/v4/v6/v1 are rotated as staging registers.
core_loop_16:
    ld1         { v0.8h},[x12]              // elements for rows 0..7  (h[7] = row 0)
    sub         x12,x12,#16
    ld1         { v18.8h},[x12]             // elements for rows 8..15 (h[7] = row 8)

    dup         v2.8h, v0.h[7]              // row 0
    dup         v4.8h, v0.h[6]              // row 1
    dup         v6.8h, v0.h[5]              // row 2
    st1         { v2.8h},[x2],x3            // row 0, bytes  0..15
    st1         { v2.8h},[x9],x3            // row 0, bytes 16..31

    dup         v1.8h, v0.h[4]              // row 3
    st1         { v4.8h},[x2],x3            // row 1
    st1         { v4.8h},[x9],x3

    dup         v2.8h, v0.h[3]              // row 4
    st1         { v6.8h},[x2],x3            // row 2
    st1         { v6.8h},[x9],x3

    dup         v4.8h, v0.h[2]              // row 5
    st1         { v1.8h},[x2],x3            // row 3
    st1         { v1.8h},[x9],x3

    dup         v6.8h, v0.h[1]              // row 6
    st1         { v2.8h},[x2],x3            // row 4
    st1         { v2.8h},[x9],x3

    dup         v1.8h, v0.h[0]              // row 7
    st1         { v4.8h},[x2],x3            // row 5
    st1         { v4.8h},[x9],x3

    dup         v2.8h, v18.h[7]             // row 8
    st1         { v6.8h},[x2],x3            // row 6
    st1         { v6.8h},[x9],x3

    dup         v4.8h, v18.h[6]             // row 9
    st1         { v1.8h},[x2],x3            // row 7
    st1         { v1.8h},[x9],x3

    dup         v6.8h, v18.h[5]             // row 10
    st1         { v2.8h},[x2],x3            // row 8
    st1         { v2.8h},[x9],x3

    dup         v1.8h, v18.h[4]             // row 11
    st1         { v4.8h},[x2],x3            // row 9
    st1         { v4.8h},[x9],x3

    dup         v2.8h, v18.h[3]             // row 12
    st1         { v6.8h},[x2],x3            // row 10
    st1         { v6.8h},[x9],x3

    dup         v4.8h, v18.h[2]             // row 13
    st1         { v1.8h},[x2],x3            // row 11
    st1         { v1.8h},[x9],x3

    dup         v6.8h, v18.h[1]             // row 14
    st1         { v2.8h},[x2],x3            // row 12
    st1         { v2.8h},[x9],x3
    sub         x12,x12,#16                 // step to the elements of the next 16 rows

    dup         v1.8h, v18.h[0]             // row 15
    st1         { v4.8h},[x2],x3            // row 13
    st1         { v4.8h},[x9],x3

    subs        x4,x4,#16                   // 16 rows done; flags consumed by bgt below
                                            // (st1 does not modify flags)
    st1         { v6.8h},[x2],x3            // row 14
    st1         { v6.8h},[x9],x3

    st1         { v1.8h},[x2],x3            // row 15
    st1         { v1.8h},[x9],x3
    bgt         core_loop_16

    ret


    // nt == 8: 8 rows of 16 bytes.
core_loop_8:
    sub         x12,x12,#16                 // x12 = pu1_ref + 4*nt - 16
    ld1         { v0.16b},[x12]             // 8 two-byte elements, h[7] = row 0

    dup         v18.8h, v0.h[7]             // row 0
    dup         v2.8h, v0.h[6]              // row 1
    dup         v4.8h, v0.h[5]              // row 2
    dup         v6.8h, v0.h[4]              // row 3
    dup         v1.8h, v0.h[3]              // row 4

    st1         { v18.8h},[x2],x3           // row 0 (frees v18 for row 5)

    dup         v18.8h, v0.h[2]             // row 5
    dup         v19.8h, v0.h[1]             // row 6
    dup         v20.8h, v0.h[0]             // row 7

    st1         { v2.8h},[x2],x3            // row 1
    st1         { v4.8h},[x2],x3            // row 2
    st1         { v6.8h},[x2],x3            // row 3
    st1         { v1.8h},[x2],x3            // row 4
    st1         { v18.8h},[x2],x3           // row 5
    st1         { v19.8h},[x2],x3           // row 6
    st1         { v20.8h},[x2],x3           // row 7

    ret


    // nt == 4: 4 rows of 8 bytes.
core_loop_4:
    sub         x12,x12,#8                  // x12 = pu1_ref + 4*nt - 8
    ld1         {v0.8b},[x12]               // 4 two-byte elements, h[3] = row 0

    dup         v6.4h, v0.h[3]              // row 0
    dup         v3.4h, v0.h[2]              // row 1
    dup         v4.4h, v0.h[1]              // row 2
    dup         v5.4h, v0.h[0]              // row 3

    st1         {v6.8b},[x2],x3             // row 0
    st1         {v3.8b},[x2],x3             // row 1
    st1         {v4.8b},[x2],x3             // row 2
    st1         {v5.8b},[x2],x3             // row 3

    ret

endloop: