@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

@/*
@//----------------------------------------------------------------------------
@// File Name            : impeg2_format_conv.s
@//
@// Description          : This file has the YUV420P to YUV420SP format
@//                        conversion implementations on neon platform.
@//
@// Reference Document   :
@//
@// Revision History     :
@//      Date            Author                  Detail Description
@//   ------------    ----------------    ----------------------------------
@//   Jul 07, 2008     Naveen Kumar T                Created
@//
@//-------------------------------------------------------------------------
@*/

@/*
@// ----------------------------------------------------------------------------
@// Include Files
@// ----------------------------------------------------------------------------
@*/
@// Everything below is assembled into the code section, aligned to 2^2 = 4 bytes.
.text
.p2align 2
@// Assemble-time constants: log2(16) and log2(2).
@// NOTE(review): neither symbol is referenced by the routines visible in this
@// file; confirm no other user before removing.
.equ log2_16 ,  4
.equ log2_2  ,  1
@/*
@// ----------------------------------------------------------------------------
@// Struct/Union Types and Define
@// ----------------------------------------------------------------------------
@*/

@/*
@// ----------------------------------------------------------------------------
@// Static Global Data section variables
@// ----------------------------------------------------------------------------
@*/
@//--------------------------- NONE --------------------------------------------

@/*
@// ----------------------------------------------------------------------------
@// Static Prototype Functions
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------

@/*
@// ----------------------------------------------------------------------------
@// Exported functions
@// ----------------------------------------------------------------------------
@*/

@/*****************************************************************************
@*
@*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q()
@*
@*  Description      : This function converts the image from YUV420P color
@*                     space to 420SP color space (UV interleaved).
@*                     The Y plane is copied as is (skipped when
@*                     convert_uv_only == 1); the U and V planes are
@*                     interleaved byte-wise as U0 V0 U1 V1 ... into the
@*                     destination chroma plane.
@*
@*  Arguments        : Stack offsets are relative to SP *after* the 24 byte
@*                     register push done at function entry.
@*                     R0           pu1_y
@*                     R1           pu1_u
@*                     R2           pu1_v
@*                     R3           pu1_dest_y
@*                     [R13 #24]    pu1_dest_uv
@*                     [R13 #28]    u2_height
@*                     [R13 #32]    u2_width
@*                     [R13 #36]    u2_stridey
@*                     [R13 #40]    u2_strideu
@*                     [R13 #44]    u2_stridev
@*                     [R13 #48]    u2_dest_stride_y
@*                     [R13 #52]    u2_dest_stride_uv
@*                     [R13 #56]    convert_uv_only
@*
@*  Values Returned  : None
@*
@*  Register Usage   : R0 - R8, R12, Q0
@*
@*  Stack Usage      : 24 Bytes
@*
@*  Interruptibility : Interruptible
@*
@*  Known Limitations
@*       Assumptions: Image Width:  Must be at least 16. A width that is not
@*                     a multiple of 16 is handled by re-copying the
@*                     last 16 luma bytes (8 bytes per chroma plane)
@*                     of the row, which steps backwards inside the
@*                     row and therefore needs width >= 16.
@*                     Image Height: Assumed to be even. Both row loops
@*                     execute at least once.
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         07 06 2010   Varshita        Draft
@*         07 06 2010   Naveen Kr T     Completed
@*
@*****************************************************************************/
                .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q:

    @// Save the callee-saved registers used below (6 words = 24 bytes)
    stmfd           sp!, {r4-r8, lr}

    ldr             r4, [sp, #56]       @// r4 = convert_uv_only
    cmp             r4, #1
    beq             yuv420sp_uv_chroma  @// Luma copy is skipped when only chroma is requested

    @/* Luma plane: straight copy, 16 bytes at a time */
    ldr             r4, [sp, #28]       @// r4 = u2_height (rows remaining)
    ldr             r5, [sp, #32]       @// r5 = u2_width
    ldr             r7, [sp, #36]       @// r7 = u2_stridey
    ldr             r8, [sp, #48]       @// r8 = u2_dest_stride_y

    sub             r7, r7, r5          @// r7 = source end-of-row increment
    sub             r8, r8, r5          @// r8 = destination end-of-row increment

yuv420sp_uv_row_loop_y:
    mov             r6, r5              @// r6 = bytes remaining in this row

yuv420sp_uv_col_loop_y:
    pld             [r0, #128]
    vld1.8          {q0}, [r0]!
    vst1.8          {q0}, [r3]!
    sub             r6, r6, #16
    cmp             r6, #15
    bgt             yuv420sp_uv_col_loop_y

    cmp             r6, #0
    beq             yuv420sp_uv_row_loop_end_y
    @// Width is not a multiple of 16: step both pointers back by (16 - remaining)
    @// so that the last 16 bytes of the row are copied, overlapping bytes that
    @// were already written. Ex: for width 162 the loop above copies 160 bytes,
    @// then bytes 146..161 are copied here.
    rsb             r6, r6, #16
    sub             r0, r0, r6
    sub             r3, r3, r6

    vld1.8          {q0}, [r0]!
    vst1.8          {q0}, [r3]!

yuv420sp_uv_row_loop_end_y:
    add             r0, r0, r7          @// Advance to start of next source row
    add             r3, r3, r8          @// Advance to start of next destination row
    subs            r4, r4, #1
    bgt             yuv420sp_uv_row_loop_y

yuv420sp_uv_chroma:
    @/* Chroma planes: interleave 8 U bytes and 8 V bytes per iteration */
    ldr             r3, [sp, #24]       @// r3 = pu1_dest_uv
    ldr             r4, [sp, #28]       @// r4 = u2_height
    ldr             r5, [sp, #32]       @// r5 = u2_width
    ldr             r7, [sp, #40]       @// r7 = u2_strideu
    ldr             r12, [sp, #44]      @// r12 = u2_stridev (r12 is caller-saved scratch)
    ldr             r8, [sp, #52]       @// r8 = u2_dest_stride_uv

    sub             r7, r7, r5, lsr #1  @// r7  = U source end-of-row increment
    sub             r12, r12, r5, lsr #1 @// r12 = V source end-of-row increment
    sub             r8, r8, r5          @// r8  = destination end-of-row increment

    mov             r5, r5, lsr #1      @// r5 = chroma width  (u2_width / 2)
    mov             r4, r4, lsr #1      @// r4 = chroma height (u2_height / 2)

yuv420sp_uv_row_loop_uv:
    mov             r6, r5              @// r6 = chroma samples remaining in this row

yuv420sp_uv_col_loop_uv:
    pld             [r1, #128]
    pld             [r2, #128]
    vld1.8          d0, [r1]!           @// d0 = 8 U samples
    vld1.8          d1, [r2]!           @// d1 = 8 V samples
    vst2.8          {d0, d1}, [r3]!     @// Store interleaved: U0 V0 U1 V1 ...
    sub             r6, r6, #8
    cmp             r6, #7
    bgt             yuv420sp_uv_col_loop_uv

    cmp             r6, #0
    beq             yuv420sp_uv_row_loop_end_uv
    @// Chroma width is not a multiple of 8: step the U and V pointers back by
    @// (8 - remaining) and the destination back by twice that amount, then
    @// redo the last 8 samples of each plane (16 interleaved output bytes).
    rsb             r6, r6, #8
    sub             r1, r1, r6
    sub             r2, r2, r6
    sub             r3, r3, r6, lsl #1

    vld1.8          d0, [r1]!
    vld1.8          d1, [r2]!
    vst2.8          {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_uv:
    add             r1, r1, r7          @// Next U row
    add             r2, r2, r12         @// Next V row (uses the V stride, not the U stride)
    add             r3, r3, r8          @// Next destination row
    subs            r4, r4, #1
    bgt             yuv420sp_uv_row_loop_uv

    @// Restore registers and return
    ldmfd           sp!, {r4-r8, pc}





@/*****************************************************************************
@*
@*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q()
@*
@*  Description      : This function converts the image from YUV420P color
@*                     space to 420SP color space (VU interleaved).
@*                     It is the same as
@*                     impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q except
@*                     that the destination registers of the two chroma
@*                     VLD1.8 loads are swapped, so the output is
@*                     V0 U0 V1 U1 ...
@*
@*  Arguments        : Stack offsets are relative to SP *after* the 24 byte
@*                     register push done at function entry.
@*                     R0           pu1_y
@*                     R1           pu1_u
@*                     R2           pu1_v
@*                     R3           pu1_dest_y
@*                     [R13 #24]    pu1_dest_uv
@*                     [R13 #28]    u2_height
@*                     [R13 #32]    u2_width
@*                     [R13 #36]    u2_stridey
@*                     [R13 #40]    u2_strideu
@*                     [R13 #44]    u2_stridev
@*                     [R13 #48]    u2_dest_stride_y
@*                     [R13 #52]    u2_dest_stride_uv
@*                     [R13 #56]    convert_uv_only
@*
@*  Values Returned  : None
@*
@*  Register Usage   : R0 - R8, R12, Q0
@*
@*  Stack Usage      : 24 Bytes
@*
@*  Interruptibility : Interruptible
@*
@*  Known Limitations
@*       Assumptions: Image Width:  Must be at least 16. A width that is not
@*                     a multiple of 16 is handled by re-copying the
@*                     last 16 luma bytes (8 bytes per chroma plane)
@*                     of the row, which steps backwards inside the
@*                     row and therefore needs width >= 16.
@*                     Image Height: Assumed to be even. Both row loops
@*                     execute at least once.
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         07 06 2010   Varshita        Draft
@*         07 06 2010   Naveen Kr T     Completed
@*
@*****************************************************************************/

                .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q:

    @// Save the callee-saved registers used below (6 words = 24 bytes)
    stmfd           sp!, {r4-r8, lr}

    ldr             r4, [sp, #56]       @// r4 = convert_uv_only
    cmp             r4, #1
    beq             yuv420sp_vu_chroma  @// Luma copy is skipped when only chroma is requested

    @/* Luma plane: straight copy, 16 bytes at a time */
    ldr             r4, [sp, #28]       @// r4 = u2_height (rows remaining)
    ldr             r5, [sp, #32]       @// r5 = u2_width
    ldr             r7, [sp, #36]       @// r7 = u2_stridey
    ldr             r8, [sp, #48]       @// r8 = u2_dest_stride_y

    sub             r7, r7, r5          @// r7 = source end-of-row increment
    sub             r8, r8, r5          @// r8 = destination end-of-row increment

yuv420sp_vu_row_loop_y:
    mov             r6, r5              @// r6 = bytes remaining in this row

yuv420sp_vu_col_loop_y:
    pld             [r0, #128]
    vld1.8          {q0}, [r0]!
    vst1.8          {q0}, [r3]!
    sub             r6, r6, #16
    cmp             r6, #15
    bgt             yuv420sp_vu_col_loop_y

    cmp             r6, #0
    beq             yuv420sp_vu_row_loop_end_y
    @// Width is not a multiple of 16: step both pointers back by (16 - remaining)
    @// so that the last 16 bytes of the row are copied, overlapping bytes that
    @// were already written. Ex: for width 162 the loop above copies 160 bytes,
    @// then bytes 146..161 are copied here.
    rsb             r6, r6, #16
    sub             r0, r0, r6
    sub             r3, r3, r6

    vld1.8          {q0}, [r0]!
    vst1.8          {q0}, [r3]!

yuv420sp_vu_row_loop_end_y:
    add             r0, r0, r7          @// Advance to start of next source row
    add             r3, r3, r8          @// Advance to start of next destination row
    subs            r4, r4, #1
    bgt             yuv420sp_vu_row_loop_y

yuv420sp_vu_chroma:
    @/* Chroma planes: interleave 8 V bytes and 8 U bytes per iteration */
    ldr             r3, [sp, #24]       @// r3 = pu1_dest_uv
    ldr             r4, [sp, #28]       @// r4 = u2_height
    ldr             r5, [sp, #32]       @// r5 = u2_width
    ldr             r7, [sp, #40]       @// r7 = u2_strideu
    ldr             r12, [sp, #44]      @// r12 = u2_stridev (r12 is caller-saved scratch)
    ldr             r8, [sp, #52]       @// r8 = u2_dest_stride_uv

    sub             r7, r7, r5, lsr #1  @// r7  = U source end-of-row increment
    sub             r12, r12, r5, lsr #1 @// r12 = V source end-of-row increment
    sub             r8, r8, r5          @// r8  = destination end-of-row increment

    mov             r5, r5, lsr #1      @// r5 = chroma width  (u2_width / 2)
    mov             r4, r4, lsr #1      @// r4 = chroma height (u2_height / 2)

yuv420sp_vu_row_loop_uv:
    mov             r6, r5              @// r6 = chroma samples remaining in this row

yuv420sp_vu_col_loop_uv:
    pld             [r1, #128]
    pld             [r2, #128]
    vld1.8          d1, [r1]!           @// d1 = 8 U samples
    vld1.8          d0, [r2]!           @// d0 = 8 V samples
    vst2.8          {d0, d1}, [r3]!     @// Store interleaved: V0 U0 V1 U1 ...
    sub             r6, r6, #8
    cmp             r6, #7
    bgt             yuv420sp_vu_col_loop_uv

    cmp             r6, #0
    beq             yuv420sp_vu_row_loop_end_uv
    @// Chroma width is not a multiple of 8: step the U and V pointers back by
    @// (8 - remaining) and the destination back by twice that amount, then
    @// redo the last 8 samples of each plane (16 interleaved output bytes).
    rsb             r6, r6, #8
    sub             r1, r1, r6
    sub             r2, r2, r6
    sub             r3, r3, r6, lsl #1

    vld1.8          d1, [r1]!
    vld1.8          d0, [r2]!
    vst2.8          {d0, d1}, [r3]!

yuv420sp_vu_row_loop_end_uv:
    add             r1, r1, r7          @// Next U row
    add             r2, r2, r12         @// Next V row (uses the V stride, not the U stride)
    add             r3, r3, r8          @// Next destination row
    subs            r4, r4, #1
    bgt             yuv420sp_vu_row_loop_uv

    @// Restore registers and return
    ldmfd           sp!, {r4-r8, pc}