@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@/*******************************************************************************
@* @file
@*  ihevcd_fmt_conv_420sp_to_420p.s
@*
@* @brief
@*  contains function definitions for format conversions
@*
@* @author
@*  ittiam
@*
@* @par list of functions:
@*  - ihevcd_fmt_conv_420sp_to_420p_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************/








.text





@/*****************************************************************************
@*
@*  Function Name    : ihevcd_fmt_conv_420sp_to_420p_a9q()
@*
@*  Description      : Converts an image from YUV 420SP (semi-planar: one luma
@*                     plane plus one plane of interleaved chroma bytes) to
@*                     YUV 420P (planar: separate Y, U and V planes).
@*                     Luma rows are copied unchanged; each chroma row is
@*                     de-interleaved, even bytes going to one destination
@*                     plane and odd bytes to the other.
@*
@*  Arguments        : R0           pu1_src_y
@*                     R1           pu1_src_uv
@*                     R2           pu1_dest_y
@*                     R3           pu1_dest_u
@*                     [R13 #40]    pu1_dest_v
@*                     [R13 #44]    u2_width
@*                     [R13 #48]    u2_height
@*                     [R13 #52]    u2_stridey
@*                     [R13 #56]    u2_strideuv
@*                     [R13 #60]    u2_dest_stridey
@*                     [R13 #64]    u2_dest_strideuv
@*                     [R13 #68]    is_u_first
@*                     [R13 #72]    disable_luma_copy
@*
@*                     Stack offsets are relative to SP after the 40-byte
@*                     register save done in the prologue. Every stack
@*                     argument is read as a 32-bit word.
@*
@*                     is_u_first        : non-zero -> even source bytes go to
@*                                         pu1_dest_u, odd bytes to pu1_dest_v;
@*                                         zero -> the two destinations are
@*                                         swapped.
@*                     disable_luma_copy : non-zero -> the Y plane is not
@*                                         copied, only chroma is processed.
@*
@*  Values Returned  : None
@*
@*  Register Usage   : R0 - R12, D0 - D1 (R4 - R12 and LR are saved and
@*                     restored by the function)
@*
@*  Stack Usage      : 40 Bytes
@*
@*  Interruptibility : Interruptible
@*
@*  Known Limitations
@*       Assumptions: Image Width:     Assumed to be a multiple of 2.
@*                    Image Height:    Assumed to be even.
@*                    Source and destination buffers are assumed not to
@*                    overlap (the tail of a row may be copied twice).
@*       Behaviour  : width <= 0 or height <= 0 returns without touching
@*                    memory (signed comparison, matching the signed loop
@*                    tests used throughout).
@*                    Rows narrower than 16 bytes are copied with a scalar
@*                    loop so that nothing outside the row is read or written.
@*                    The chroma row loop is post-tested, so height == 1
@*                    (outside the stated assumptions) still copies one
@*                    chroma row.
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         16 05 2012   Naveen SR     draft
@*
@*****************************************************************************/

.globl ihevcd_fmt_conv_420sp_to_420p_a9q

.type ihevcd_fmt_conv_420sp_to_420p_a9q, %function

ihevcd_fmt_conv_420sp_to_420p_a9q:
    STMFD       sp!,{r4-r12, lr}            @//Save 10 registers (40 bytes); stack args now start at [sp,#40]

    LDR         r5,[sp,#60]                 @//Load u2_dest_stridey
    LDR         r7,[sp,#52]                 @//Load u2_stridey
    LDR         r8,[sp,#44]                 @//Load u2_width
    LDR         r9,[sp,#48]                 @//Load u2_height

    @//Nothing to copy for an empty image. Without this check the post-tested
    @//loops below would still perform one 16-byte load/store per plane.
    CMP         r8,#0
    BLE         exit
    CMP         r9,#0
    BLE         exit

    SUB         r10,r7,r8                   @// Src Y increment (stride - width)
    SUB         r11,r5,r8                   @// Dst Y increment (stride - width)

    LDR         r5,[sp,#72]                 @//Load disable_luma_copy flag
    CMP         r5,#0                       @//skip luma if disable_luma_copy is non-zero
    BNE         uv_copy_start

    @/* Copy Y */

    MOV         r4,r9                       @// Copying height (rows remaining)
y_row_loop:
    MOV         r6,r8                       @// Copying width (bytes remaining in this row)
    CMP         r6,#16
    BLT         y_narrow_loop               @//Row shorter than one NEON block: copy byte-wise

y_col_loop:

    SUB         r6,r6,#16
    vld1.8      {d0,d1},[r0]!
    vst1.8      {d0,d1},[r2]!
    CMP         r6,#16
    BGE         y_col_loop
    CMP         r6,#0
    BEQ         y_col_loop_end
    @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    @//Ex if width is 162, above loop will process 160 pixels. And
    @//Both source and destination will point to 146th pixel and then 16 bytes will be read
    @// and written using VLD1 and VST1
    @//At least one full block was copied above, so the rewind stays inside the row.
    RSB         r6,r6,#16                   @// r6 = 16 - remaining = bytes to step back
    SUB         r0,r0,r6
    SUB         r2,r2,r6
    vld1.8      {d0,d1}, [r0]!
    vst1.8      {d0,d1}, [r2]!
    B           y_col_loop_end

y_narrow_loop:
    @//width < 16: a 16-byte access would run past the row end and the rewind
    @//above would step before the row start, so copy one byte at a time.
    @//r6 >= 1 here because width <= 0 was rejected on entry.
    LDRB        r12,[r0],#1
    STRB        r12,[r2],#1
    SUBS        r6,r6,#1
    BGT         y_narrow_loop

y_col_loop_end:
    @//Both pointers have advanced by exactly width bytes; move to next row start
    ADD         r0, r0, r10
    ADD         r2, r2, r11
    SUBS        r4, r4, #1
    BGT         y_row_loop


    @/* Copy UV */
uv_copy_start:

    LDR         r5,[sp,#64]                 @//Load u2_dest_strideuv
    LDR         r7,[sp,#56]                 @//Load u2_strideuv

    MOV         r9,r9,LSR #1                @// height/2 chroma rows
    @//r8 is deliberately not halved: an interleaved source row holds
    @//width bytes (width/2 bytes for each of the two chroma components).

    SUB         r10,r7,r8                   @// Src UV increment (stride - width)
    MOV         r11,r8,LSR #1
    SUB         r11,r5,r11                  @// Dst U and V increment (stride - width/2)

    LDR         r5,[sp,#40]                 @//Load pu1_dest_v

    LDR         r4,[sp,#68]                 @//Load is_u_first_flag
    CMP         r4,#0                       @//Swap U and V dest if is_u_first_flag is zero
    MOVEQ       r4,r5
    MOVEQ       r5,r3
    MOVEQ       r3,r4

    MOV         r4,r9                       @// Copying height (chroma rows remaining)
uv_row_loop:
    MOV         r6,r8                       @// Copying width (interleaved bytes remaining)
    CMP         r6,#16
    BLT         uv_narrow_loop              @//Row shorter than one NEON block: copy pair-wise

uv_col_loop:

    SUB         r6,r6,#16

    PLD         [r1,#128]
    vld2.8      {d0,d1},[r1]!               @// De-interleave: d0 = even bytes, d1 = odd bytes
    VST1.8      D0,[r3]!
    VST1.8      D1,[r5]!
    CMP         r6,#16
    BGE         uv_col_loop
    CMP         r6,#0
    BEQ         uv_col_loop_end
    @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    @//Ex if width is 162, above loop will process 160 bytes. And
    @//the source will point to the 146th byte and then 16 bytes will be read
    @//using VLD2 and written as 8 + 8 bytes using VST1
    @//At least one full block was copied above, so the rewind stays inside the row.
    RSB         r6,r6,#16                   @// r6 = 16 - remaining = source bytes to step back
    SUB         r1,r1,r6
    SUB         r3,r3,r6,LSR #1             @// each destination steps back half as far
    SUB         r5,r5,r6,LSR #1
    vld2.8      {d0,d1}, [r1]!
    VST1.8      D0, [r3]!
    VST1.8      D1, [r5]!
    B           uv_col_loop_end

uv_narrow_loop:
    @//width < 16: split one interleaved byte pair per iteration so that
    @//nothing outside the row is read or written. r7 (source stride) is no
    @//longer needed once r10 has been computed, so it is reused as scratch.
    LDRB        r12,[r1],#1                 @// even byte -> plane addressed by r3
    LDRB        r7,[r1],#1                  @// odd byte  -> plane addressed by r5
    STRB        r12,[r3],#1
    STRB        r7,[r5],#1
    SUBS        r6,r6,#2
    BGT         uv_narrow_loop

uv_col_loop_end:
    @//r1 advanced by width, r3/r5 by width/2; move to next row start
    ADD         r1, r1, r10
    ADD         r3, r3, r11
    ADD         r5, r5, r11
    SUBS        r4, r4, #1
    BGT         uv_row_loop

exit:
    LDMFD       sp!,{r4-r12, pc}            @//Restore registers and return