//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_intra_pred_luma_16x16_av8.s
//*
//* @brief
//*  Contains function definitions for intra 16x16 luma prediction.
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*
//*  - ih264_intra_pred_luma_16x16_mode_vert_av8()
//*  - ih264_intra_pred_luma_16x16_mode_horz_av8()
//*  - ih264_intra_pred_luma_16x16_mode_dc_av8()
//*  - ih264_intra_pred_luma_16x16_mode_plane_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

//* All the functions here are replicated from ih264_intra_pred_filters.c
//*


.text
.p2align 2
.include "ih264_neon_macros.s"
.extern ih264_gai1_intrapred_luma_plane_coeffs



///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_16x16_mode_vert
//*
//* @brief
//*   Perform intra prediction for luma 16x16 mode: vertical
//*
//* @par Description:
//* Perform intra prediction for luma 16x16 mode: vertical, described in sec 8.3.3.1
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//* availability of neighbouring pixels(Not used in this function)
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_dst,
//                                        WORD32 src_strd,
//                                        WORD32 dst_strd,
//                                        WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//   x4 =>  ui_neighboravailability


    .global ih264_intra_pred_luma_16x16_mode_vert_av8

//------------------------------------------------------------------------------
// Intra 16x16 vertical prediction.
//
// Reads the 16 bytes pu1_src[17..32] (the same bytes the DC routine in this
// file loads as the "top" neighbours) and writes them unchanged to each of the
// 16 destination rows.
//
// In:    x0 = pu1_src, x1 = pu1_dst, w3 = dst_strd (signed 32-bit)
//        x2 (src_strd) and x4 (ui_neighboravailability) are not read.
// Out:   16 rows of 16 bytes at pu1_dst, rows dst_strd bytes apart.
// Clobb: x0, x1, x3, v0
//------------------------------------------------------------------------------
ih264_intra_pred_luma_16x16_mode_vert_av8:

    // push_v_regs / pop_v_regs come from ih264_neon_macros.s; presumably they
    // save/restore the callee-saved SIMD registers (definition not visible here).
    push_v_regs

    // dst_strd is a 32-bit int; the upper half of x3 is unspecified on entry,
    // so widen it before using x3 as a 64-bit post-index increment.
    sxtw      x3, w3

    add       x0, x0, #17               // x0 -> pu1_src[17]
    ld1       {v0.16b}, [x0]            // the 16 bytes replicated into every row

    .rept 16                            // one store per output row
    st1       {v0.16b}, [x1], x3
    .endr

    pop_v_regs
    ret





///******************************************************************************


///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_16x16_mode_horz
//*
//* @brief
//*  Perform intra prediction for luma 16x16 mode: horizontal
//*
//* @par Description:
//*  Perform intra prediction for luma 16x16 mode: horizontal, described in sec 8.3.3.2
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//* availability of neighbouring pixels(Not used in this function)
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
//                                         UWORD8 *pu1_dst,
//                                         WORD32 src_strd,
//                                         WORD32 dst_strd,
//                                         WORD32 ui_neighboravailability)
//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//   x4 =>  ui_neighboravailability

    .global ih264_intra_pred_luma_16x16_mode_horz_av8

//------------------------------------------------------------------------------
// Intra 16x16 horizontal prediction.
//
// Reads the 16 bytes pu1_src[0..15] and fills destination row r (r = 0..15)
// with 16 copies of pu1_src[15 - r]; i.e. the neighbour bytes are consumed in
// reverse order, byte 15 feeding the first row.
//
// In:    x0 = pu1_src, x1 = pu1_dst, w3 = dst_strd (signed 32-bit)
//        x2 (src_strd) and x4 (ui_neighboravailability) are not read.
// Out:   16 rows of 16 bytes at pu1_dst, rows dst_strd bytes apart.
// Clobb: x1, x3, v0, v10-v25
//------------------------------------------------------------------------------
ih264_intra_pred_luma_16x16_mode_horz_av8:

    // v10-v15 are written below; push_v_regs / pop_v_regs (ih264_neon_macros.s)
    // presumably preserve the callee-saved SIMD registers around this body.
    push_v_regs

    // dst_strd is a 32-bit int; the upper half of x3 is unspecified on entry,
    // so widen it before using x3 as a 64-bit post-index increment.
    sxtw      x3, w3

    ld1       {v0.16b}, [x0]            // v0 = pu1_src[0..15]

    // Broadcasts run a few registers ahead of the stores so that each store
    // uses a value produced several instructions earlier.
    dup       v10.16b, v0.b[15]         // row 0
    dup       v11.16b, v0.b[14]         // row 1
    dup       v12.16b, v0.b[13]         // row 2
    dup       v13.16b, v0.b[12]         // row 3
    st1       {v10.16b}, [x1], x3
    dup       v14.16b, v0.b[11]         // row 4
    st1       {v11.16b}, [x1], x3
    dup       v15.16b, v0.b[10]         // row 5
    st1       {v12.16b}, [x1], x3
    dup       v16.16b, v0.b[9]          // row 6
    st1       {v13.16b}, [x1], x3
    dup       v17.16b, v0.b[8]          // row 7
    st1       {v14.16b}, [x1], x3
    dup       v18.16b, v0.b[7]          // row 8
    st1       {v15.16b}, [x1], x3
    dup       v19.16b, v0.b[6]          // row 9
    st1       {v16.16b}, [x1], x3
    dup       v20.16b, v0.b[5]          // row 10
    st1       {v17.16b}, [x1], x3
    dup       v21.16b, v0.b[4]          // row 11
    st1       {v18.16b}, [x1], x3
    dup       v22.16b, v0.b[3]          // row 12
    st1       {v19.16b}, [x1], x3
    dup       v23.16b, v0.b[2]          // row 13
    st1       {v20.16b}, [x1], x3
    dup       v24.16b, v0.b[1]          // row 14
    st1       {v21.16b}, [x1], x3
    dup       v25.16b, v0.b[0]          // row 15
    st1       {v22.16b}, [x1], x3
    st1       {v23.16b}, [x1], x3
    st1       {v24.16b}, [x1], x3
    st1       {v25.16b}, [x1], x3

    pop_v_regs
    ret







///******************************************************************************


///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_16x16_mode_dc
//*
//* @brief
//*  Perform intra prediction for luma 16x16 mode: DC
//*
//* @par Description:
//*  Perform intra prediction for luma 16x16 mode: DC, described in sec 8.3.3.3
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
//                                       UWORD8 *pu1_dst,
//                                       WORD32 src_strd,
//                                       WORD32 dst_strd,
//                                       WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//   x4 =>  ui_neighboravailability

    .global ih264_intra_pred_luma_16x16_mode_dc_av8

//------------------------------------------------------------------------------
// Intra 16x16 DC prediction.
//
// Fills the 16x16 destination block with a single byte value:
//   left and top available : (sum(left) + sum(top) + 16) >> 5
//   only one available     : (sum(that side)       +  8) >> 4
//   neither available      : 128
// where left = pu1_src[0..15] and top = pu1_src[17..32].
//
// Availability is taken from ui_neighboravailability: bit 0 (0x01) selects the
// left column, bit 2 (0x04) selects the top row. All other bits, including the
// unspecified upper half of x4, are ignored.
//
// In:    x0 = pu1_src, x1 = pu1_dst, w3 = dst_strd (signed 32-bit),
//        w4 = ui_neighboravailability. x2 (src_strd) is not read.
// Out:   16 rows of 16 bytes at pu1_dst, rows dst_strd bytes apart.
// Clobb: x1, x3, x6, w10, w11, w15, v0-v3, v10, v20
//------------------------------------------------------------------------------
ih264_intra_pred_luma_16x16_mode_dc_av8:

    // v10 is written below; push_v_regs / pop_v_regs (ih264_neon_macros.s)
    // presumably preserve the callee-saved SIMD registers around this body.
    push_v_regs

    // dst_strd is a 32-bit int; the upper half of x3 is unspecified on entry,
    // so widen it before using x3 as a 64-bit post-index increment.
    sxtw      x3, w3

    movi      v0.16b, #0                // left pixels (stay 0 when unavailable)
    movi      v1.16b, #0                // top pixels  (stay 0 when unavailable)
    mov       w10, #0                   // rounding term: +8 per neighbour used
    mov       w11, #3                   // shift amount:  +1 per neighbour used
    tbz       w4, #0, top_available     // bit 0 clear => left not available
    ld1       {v0.16b}, [x0]            // left = pu1_src[0..15]
    add       w10, w10, #8
    add       w11, w11, #1
top_available:
    tbz       w4, #2, none_available    // bit 2 clear => top not available
    add       x6, x0, #17
    ld1       {v1.16b}, [x6]            // top = pu1_src[17..32]
    add       w10, w10, #8
    add       w11, w11, #1
    b         summation
none_available:
    // Top is unavailable here. w10 is non-zero only if the left column was
    // loaded above, so this tests exactly "left available" without depending
    // on unrelated flag bits or on the upper half of x4.
    cbnz      w10, summation
    mov       w15, #128                 // no neighbours: mid-grey
    dup       v20.16b, w15
    b         store
summation:
    uaddl     v2.8h, v0.8b, v1.8b       // widen and add low 8 left/top bytes
    uaddl2    v3.8h, v0.16b, v1.16b     // widen and add high 8 left/top bytes
    dup       v10.8h, w10               // rounding term in every lane
    neg       w11, w11
    dup       v20.8h, w11               // negative count => right shift in uqshl
    add       v0.8h, v2.8h, v3.8h
    mov       v1.d[0], v0.d[1]
    add       v0.4h, v0.4h, v1.4h       // fold 8 partial sums to 4
    addp      v0.4h, v0.4h, v0.4h       // ... to 2
    addp      v0.4h, v0.4h, v0.4h       // ... every lane now holds the total
    add       v0.4h, v0.4h, v10.4h      // + rounding
    uqshl     v0.8h, v0.8h, v20.8h      // >> (4 or 5)
    sqxtun    v0.8b, v0.8h
    dup       v20.16b, v0.b[0]          // broadcast the DC value

store:

    .rept 16                            // one store per output row
    st1       {v20.16b}, [x1], x3
    .endr

end_func:

    pop_v_regs
    ret





///******************************************************************************


///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_16x16_mode_plane
//*
//* @brief
//*  Perform intra prediction for luma 16x16 mode: plane
//*
//* @par Description:
//*  Perform intra prediction for luma 16x16 mode: plane, described in sec 8.3.3.4
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_dst,
//                                        WORD32 src_strd,
//                                        WORD32 dst_strd,
//                                        WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//   x4 =>  ui_neighboravailability

    .global ih264_intra_pred_luma_16x16_mode_plane_av8

//------------------------------------------------------------------------------
// Intra 16x16 plane prediction.
//
// With s[] = pu1_src[] and w[0..15] the 16 bytes read from
// ih264_gai1_intrapred_luma_plane_coeffs (table defined elsewhere; the row
// arithmetic below is the plane formula only if w[i] = i + 1 -- TODO confirm):
//
//   H = sum_{i=0..7} w[i] * (s[25 + i] - s[23 - i])
//   V = sum_{i=0..7} (i + 1) * (s[7 - i] - s[9 + i])
//   b = (5 * H + 32) >> 6
//   c = (5 * V + 32) >> 6
//   a = 16 * (s[0] + s[32])
//   dst[y][x] = sat_u8((a + b * (w[x] - 8) + c * (y - 7) + 16) >> 5)
//
// All vector arithmetic is done in 16-bit lanes.
//
// In:    x0 = pu1_src, x1 = pu1_dst, w3 = dst_strd (signed 32-bit)
//        x2 (src_strd) and x4 (ui_neighboravailability) are not read.
// Out:   16 rows of 16 bytes at pu1_dst, rows dst_strd bytes apart.
// Clobb: x0-x3, x5-x10, x12, x14, v0, v2, v4, v6, v7, v16, v18, v20-v23,
//        v26, v28, v30
//------------------------------------------------------------------------------
ih264_intra_pred_luma_16x16_mode_plane_av8:

    // push_v_regs / pop_v_regs come from ih264_neon_macros.s; presumably they
    // save/restore the callee-saved SIMD registers (definition not visible here).
    push_v_regs

    // dst_strd is a 32-bit int; the upper half of x3 is unspecified on entry,
    // so widen it before using x3 as a 64-bit post-index increment.
    sxtw      x3, w3

    mov       x2, x1                    // x2 = pu1_dst (x1 is reused below)
    add       x10, x0, #16              // x10 -> s[16] (top_left)
    mov       x1, x10
    mov       x8, #9
    ld1       {v2.2s}, [x1], x8         // v2 = s[16..23]; x1 -> s[25]

    adrp      x7, :got:ih264_gai1_intrapred_luma_plane_coeffs
    ldr       x7, [x7, #:got_lo12:ih264_gai1_intrapred_luma_plane_coeffs]

    ld1       {v0.2s}, [x1]             // v0 = s[25..32]
    rev64     v2.8b, v2.8b              // v2 = s[23], s[22], ..., s[16]
    ld1       {v6.2s, v7.2s}, [x7]      // v6 = w[0..7], v7 = w[8..15]
    usubl     v0.8h, v0.8b, v2.8b       // lane i = s[25 + i] - s[23 - i]
    uxtl      v16.8h, v6.8b
    mul       v0.8h, v0.8h, v16.8h      // weight each difference by w[i]
    uxtl      v18.8h, v7.8b
    add       x7, x0, #7                // x7 walks down from s[7]
    add       x0, x0, #9                // x0 walks up   from s[9]

    // From here the vector reduction of H is interleaved with the scalar
    // accumulation of V in x12. ldrb zero-extends, so the loaded bytes are
    // already valid 64-bit values for the signed subtractions.
    addp      v0.8h, v0.8h, v0.8h       // only the low 4 lanes are used next
    ldrb      w8, [x7], #-1             // s[7]
    ldrb      w9, [x0], #1              // s[9]
    saddlp    v0.2s, v0.4h
    sub       x12, x8, x9               // V  = 1 * (s[7] - s[9])
    ldrb      w8, [x7], #-1             // s[6]
    saddlp    v0.1d, v0.2s              // v0.d[0] = H
    ldrb      w9, [x0], #1              // s[10]
    sub       x8, x8, x9
    shl       v2.2s, v0.2s, #2
    add       x12, x12, x8, lsl #1      // V += 2 * (s[6] - s[10])
    add       v0.2s, v0.2s, v2.2s       // 5 * H
    ldrb      w8, [x7], #-1             // s[5]
    ldrb      w9, [x0], #1              // s[11]
    srshr     v0.2s, v0.2s, #6          // b = (5 * H + 32) >> 6
    sub       x8, x8, x9
    ldrb      w5, [x7], #-1             // s[4]
    add       x8, x8, x8, lsl #1
    dup       v4.8h, v0.h[0]            // b in every 16-bit lane
    add       x12, x12, x8              // V += 3 * (s[5] - s[11])
    ldrb      w9, [x0], #1              // s[12]
    mul       v0.8h, v4.8h, v16.8h      // b * w[0..7]
    sub       x5, x5, x9
    mul       v2.8h, v4.8h, v18.8h      // b * w[8..15]
    add       x12, x12, x5, lsl #2      // V += 4 * (s[4] - s[12])
    ldrb      w8, [x7], #-1             // s[3]
    ldrb      w9, [x0], #1              // s[13]
    sub       x8, x8, x9
    ldrb      w5, [x7], #-1             // s[2]
    add       x8, x8, x8, lsl #2
    ldrb      w6, [x0], #1              // s[14]
    add       x12, x12, x8              // V += 5 * (s[3] - s[13])
    ldrb      w8, [x7], #-1             // s[1]
    ldrb      w9, [x0], #1              // s[15]
    sub       x5, x5, x6
    sub       x8, x8, x9
    add       x5, x5, x5, lsl #1
    sub       x14, x8, x8, lsl #3       // -7 * (s[1] - s[15])
    neg       x8, x14                   //  7 * (s[1] - s[15])
    add       x12, x12, x5, lsl #1      // V += 6 * (s[2] - s[14])
    ldrb      w5, [x7], #-1             // s[0]
    ldrb      w6, [x10]                 // s[16] (top_left)
    add       x12, x12, x8              // V += 7 * (s[1] - s[15])
    sub       x9, x5, x6
    ldrb      w6, [x1, #7]              // s[32]
    add       x12, x12, x9, lsl #3      // V += 8 * (s[0] - s[16])
    add       x8, x5, x6
    add       x12, x12, x12, lsl #2     // 5 * V
    lsl       x8, x8, #4                // a = 16 * (s[0] + s[32])
    add       x12, x12, #0x20
    asr       x12, x12, #6              // c = (5 * V + 32) >> 6 (V may be negative)

    shl       v28.8h, v4.8h, #3         // 8 * b
    dup       v6.8h, w12                // c in every lane (low 16 bits of x12)
    dup       v30.8h, w8                // a in every lane
    shl       v26.8h, v6.8h, #3         // 8 * c
    sub       v30.8h, v30.8h, v28.8h
    sub       v30.8h, v30.8h, v26.8h    // a - 8b - 8c
    add       v28.8h, v30.8h, v6.8h     // a - 8b - 7c
    add       v26.8h, v28.8h, v0.8h     // row 0, columns 0..7
    add       v28.8h, v28.8h, v2.8h     // row 0, columns 8..15

    // Each row is the previous row plus c; sqrshrun #5 applies the +16
    // rounding, the >> 5 and the saturation to 0..255. Rows alternate between
    // v20/v21 and v22/v23 so a store overlaps the next row's narrowing.
    sqrshrun  v20.8b, v26.8h, #5
    sqrshrun  v21.8b, v28.8h, #5
    add       v26.8h, v26.8h, v6.8h
    add       v28.8h, v28.8h, v6.8h
    sqrshrun  v22.8b, v26.8h, #5
    st1       {v20.2s, v21.2s}, [x2], x3    // row 0

    .rept 7                                 // rows 1..14, two per iteration
    sqrshrun  v23.8b, v28.8h, #5
    add       v26.8h, v26.8h, v6.8h
    add       v28.8h, v28.8h, v6.8h
    sqrshrun  v20.8b, v26.8h, #5
    st1       {v22.2s, v23.2s}, [x2], x3    // odd row
    sqrshrun  v21.8b, v28.8h, #5
    add       v26.8h, v26.8h, v6.8h
    add       v28.8h, v28.8h, v6.8h
    sqrshrun  v22.8b, v26.8h, #5
    st1       {v20.2s, v21.2s}, [x2], x3    // even row
    .endr

    sqrshrun  v23.8b, v28.8h, #5
    st1       {v22.2s, v23.2s}, [x2], x3    // row 15

end_func_plane:

    pop_v_regs
    ret