///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevcd_fmt_conv_420sp_to_420p.s
//*
//* //brief
//*  contains function definitions for format conversions
//*
//* //author
//*  ittiam
//*
//* //par list of functions:
//*  - ihevcd_fmt_conv_420sp_to_420p_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************/

.text

.include "ihevc_neon_macros.s"




///*****************************************************************************
//*
//*  Function Name    : ihevcd_fmt_conv_420sp_to_420p_av8()
//*
//*  Description      : Converts an image from YUV420SP (semi-planar: a Y plane
//*                     plus one plane of interleaved chroma bytes) to YUV420P
//*                     (planar: separate Y, U and V planes).
//*                     The Y plane is copied row by row unless
//*                     disable_luma_copy is non-zero. Each chroma row is
//*                     de-interleaved with LD2 into the two destination
//*                     chroma planes.
//*
//*  Arguments        : x0           pu1_src_y
//*                     x1           pu1_src_uv
//*                     x2           pu1_dest_y
//*                     x3           pu1_dest_u
//*                     x4           pu1_dest_v
//*                     w5           u2_width          (32-bit, sign-extended here)
//*                     w6           u2_height         (32-bit, sign-extended here)
//*                     w7           u2_stridey        (32-bit, sign-extended here)
//*                     stack        u2_strideuv       read at [sp,#80]  after prologue
//*                     stack        u2_dest_stridey   read at [sp,#88]  after prologue
//*                     stack        u2_dest_strideuv  read at [sp,#96]  after prologue
//*                     stack        is_u_first        read at [sp,#104] after prologue
//*                     stack        disable_luma_copy read at [sp,#112] after prologue
//*
//*                     NOTE(review): the offsets above correspond to stack
//*                     arguments at [sp,#0],#8,#16,#24,#32 on entry, assuming
//*                     push_v_regs (defined in ihevc_neon_macros.s) pushes 64
//*                     bytes and each stack argument occupies an 8-byte slot
//*                     - confirm against the macro and the target ABI.
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x0 - x11, x15, x20, v0, v1
//*
//*  Stack Usage      : 16 bytes for x19/x20 plus whatever push_v_regs saves
//*
//*  Known Limitations
//*       Assumptions: Image width is assumed to be even and at least 16: the
//*                    first 16-byte copy of every row is unconditional, and
//*                    the tail fix-up for widths that are not a multiple of
//*                    16 steps the pointers backwards, which would move them
//*                    before the start of the row for widths below 16.
//*                    Image height is assumed to be even (chroma rows =
//*                    height / 2).
//*                    Nothing is copied when width <= 0 or height <= 0.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         16 05 2012   Naveen SR     draft
//*
//*****************************************************************************/

.globl ihevcd_fmt_conv_420sp_to_420p_av8

.type ihevcd_fmt_conv_420sp_to_420p_av8, %function

ihevcd_fmt_conv_420sp_to_420p_av8:
    push_v_regs
    stp         x19, x20, [sp,#-16]!        // x20 is callee-saved and used as scratch below

    mov         x15, x4                     // keep pu1_dest_v; x4 is reused as the row counter

    // Width, height and source Y stride are 32-bit arguments. The upper halves
    // of x5/x6/x7 are not guaranteed to be clean, so extend them explicitly,
    // the same way the stack-passed 32-bit arguments are handled.
    sxtw        x8, w5                      // x8  = u2_width
    sxtw        x9, w6                      // x9  = u2_height
    sxtw        x7, w7                      // x7  = u2_stridey

    // The copy loops below test their counters only after the first pass,
    // so empty dimensions have to be rejected up front.
    cmp         x8, #0
    ble         exit
    cmp         x9, #0
    ble         exit

    ldr         w5, [sp,#88]                // u2_dest_stridey
    sxtw        x5, w5

    sub         x10, x7, x8                 // src Y row increment = stride - width
    sub         x11, x5, x8                 // dst Y row increment = stride - width

    ldr         w5, [sp,#112]               // disable_luma_copy flag
    cbnz        w5, uv_copy_start           // non-zero: skip the luma plane

    ///* Copy Y */

    mov         x4, x9                      // x4 = rows remaining
y_row_loop:
    mov         x6, x8                      // x6 = bytes remaining in this row

y_col_loop:
    sub         x6, x6, #16
    ld1         {v0.8b, v1.8b}, [x0], #16
    st1         {v0.8b, v1.8b}, [x2], #16
    cmp         x6, #16
    bge         y_col_loop
    cmp         x6, #0
    beq         y_col_loop_end

    // Width is not a multiple of 16: step both pointers back by (16 - remainder)
    // so that one more full 16-byte copy ends exactly at the end of the row.
    // Ex: for width 162 the loop above handles 160 bytes; the pointers are
    // moved back by 14 to byte 146 and the last 16 bytes are copied again.
    sub         x20, x6, #16
    neg         x6, x20                     // x6 = 16 - remainder
    sub         x0, x0, x6
    sub         x2, x2, x6
    ld1         {v0.8b, v1.8b}, [x0], #16
    st1         {v0.8b, v1.8b}, [x2], #16

y_col_loop_end:
    add         x0, x0, x10                 // advance to the next source row
    add         x2, x2, x11                 // advance to the next destination row
    subs        x4, x4, #1
    bgt         y_row_loop


    ///* Copy UV */
uv_copy_start:

    ldr         w5, [sp,#96]                // u2_dest_strideuv
    sxtw        x5, w5
    ldr         w7, [sp,#80]                // u2_strideuv
    sxtw        x7, w7

    lsr         x9, x9, #1                  // chroma rows = height / 2
    cbz         x9, exit                    // no chroma rows to copy

    sub         x10, x7, x8                 // src UV row increment = stride - width
    lsr         x11, x8, #1
    sub         x11, x5, x11                // dst U/V row increment = stride - width / 2

    // LD2 puts the even source bytes in v0 and the odd source bytes in v1.
    // x3 receives v0 and x5 receives v1: when is_u_first is zero the two
    // destination planes are swapped.
    ldr         w4, [sp,#104]               // is_u_first flag
    cmp         w4, #0
    csel        x5, x15, x3, ne             // odd bytes  -> pu1_dest_v if is_u_first, else pu1_dest_u
    csel        x3, x3, x15, ne             // even bytes -> pu1_dest_u if is_u_first, else pu1_dest_v

    mov         x4, x9                      // x4 = chroma rows remaining
uv_row_loop:
    mov         x6, x8                      // x6 = interleaved bytes remaining in this row

uv_col_loop:
    sub         x6, x6, #16

    prfm        pldl1keep, [x1,#128]
    ld2         {v0.8b, v1.8b}, [x1], #16   // de-interleave 16 source bytes into 8 + 8
    st1         {v0.8b}, [x3], #8
    st1         {v1.8b}, [x5], #8
    cmp         x6, #16
    bge         uv_col_loop
    cmp         x6, #0
    beq         uv_col_loop_end

    // Width is not a multiple of 16: step the source back by (16 - remainder)
    // and each destination plane back by half of that, then convert one more
    // full group so that it ends exactly at the end of the row.
    sub         x20, x6, #16
    neg         x6, x20                     // x6 = 16 - remainder
    sub         x1, x1, x6
    sub         x3, x3, x6, lsr #1
    sub         x5, x5, x6, lsr #1
    ld2         {v0.8b, v1.8b}, [x1], #16
    st1         {v0.8b}, [x3], #8
    st1         {v1.8b}, [x5], #8

uv_col_loop_end:
    add         x1, x1, x10                 // advance to the next source row
    add         x3, x3, x11                 // advance both destination planes
    add         x5, x5, x11
    subs        x4, x4, #1
    bgt         uv_row_loop

exit:
    ldp         x19, x20, [sp], #16
    pop_v_regs
    ret