@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@/*******************************************************************************
@* @file
@*  ihevcd_fmt_conv_420sp_to_rgba8888.s
@*
@* @brief
@*  contains function definitions for format conversions
@*
@* @author
@*  ittiam
@*
@* @par list of functions:
@*  ihevcd_fmt_conv_420sp_to_rgba8888_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************/
    @ Assembly-time constant defined as 0. Nothing in the visible part of this
    @ file references it.
    .equ DO1STROUNDING, 0

    @ ARM
    @
    @ PRESERVE8
    @ NOTE(review): the two names above look like armasm directives that were
    @ kept as comments; as comments they have no effect on the assembly.

    @ Place what follows in the .text section, aligned to 2^2 = 4 bytes.
.text
.p2align 2




@/*****************************************************************************
@*
@*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888_a9q()
@*
@*  Description      : Converts a YUV 4:2:0 semi-planar picture (one luma
@*                     plane plus one plane of interleaved chroma bytes) into
@*                     32-bit pixels. Two luma rows share one chroma row and
@*                     are converted together, 16 pixels per step.
@*
@*                     With u = U - 128, v = V - 128 and the coefficients
@*                     taken as signed 16-bit Q13 values:
@*                       B = sat_u8(Y + ((u * 0x4092) >> 13))
@*                       G = sat_u8(Y + ((u * 0xF379 + v * 0xE5F8) >> 13))
@*                       R = sat_u8(Y + ((v * 0x3311) >> 13))
@*                     Each pixel is stored as the four bytes B, G, R, 0 in
@*                     increasing address order.
@*
@*  Arguments        : R0           luma (Y) source pointer
@*                     R1           interleaved chroma source pointer; in each
@*                                  byte pair the even byte is used as U and
@*                                  the odd byte as V
@*                     R2           destination pointer (4 bytes per pixel)
@*                     R3           picture width in pixels
@*                     [SP #0]      picture height in rows
@*                     [SP #4]      luma stride in bytes
@*                     [SP #8]      chroma stride in bytes
@*                     [SP #12]     destination stride in pixels
@*                     The SP offsets above are as seen on entry; after the
@*                     prologue the same words are at [SP #104] .. [SP #116].
@*
@*  Values Returned  : None
@*
@*  Register Usage   : R4-R12, LR and D8-D15 are saved and restored.
@*                     R0-R3, the flags, Q0-Q12, Q14 and Q15 are clobbered.
@*
@*  Stack Usage      : 104 Bytes (40 for core registers + 64 for D8-D15)
@*
@*  Interruptibility : Interruptible
@*
@*  Known Limitations
@*       Width         : Processed in groups of 16 pixels. The width is
@*                       rounded down to a multiple of 16; any remaining
@*                       right-hand columns are left unwritten.
@*       Height        : Processed in pairs of rows (height >> 1 pairs); the
@*                       last row of an odd height is left unwritten.
@*       Degenerate    : Returns without writing anything when width < 16 or
@*                       height < 2.
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         07 06 2010   Varshita        Draft
@*         07 06 2010   Naveen Kr T     Completed
@*         05 08 2013   Naveen K P      Modified for HEVC
@*         30 10 2018   Saurabh Sood    Store D registers to stack
@*****************************************************************************/
    .global ihevcd_fmt_conv_420sp_to_rgba8888_a9q
.type ihevcd_fmt_conv_420sp_to_rgba8888_a9q, function
ihevcd_fmt_conv_420sp_to_rgba8888_a9q:

    @// Prologue: 10 core registers (40 bytes) + D8-D15 (64 bytes) = 104 bytes,
    @// so the caller's stack arguments start at [sp,#104] from here on.
    STMFD       SP!,{R4-R12,LR}
    VPUSH       {d8-d15}

    @// Register roles for the rest of the function
    @//   R0  - luma pointer, first row of the current row pair
    @//   R1  - chroma pointer
    @//   R2  - destination pointer, first row of the current row pair
    @//   R3  - width (rounded down to a multiple of 16 below)
    @//   R5  - remaining row pairs
    @//   R6  - luma stride during setup, then remaining 16-pixel groups
    @//   R7  - chroma stride during setup, then luma pointer of the second row
    @//   R8  - destination pointer of the second row
    @//   R9  - destination stride in pixels
    @//   R10 - luma stride - width
    @//   R11 - chroma stride - width
    @//   R14 - destination stride - width (pixels)
    @//   D0  - coefficients {C1,C2,C3,C4}, D1 - 128 in every byte

    @// Q13 coefficients, one per 16-bit lane of D0 (R10 is only a scratch here)
    MOVW        R10,#0x3311
    VMOV.16     D0[0],R10                   @// C1: V contribution to R

    MOVW        R10,#0xF379
    VMOV.16     D0[1],R10                   @// C2: U contribution to G (negative as s16)

    MOVW        R10,#0xE5F8
    VMOV.16     D0[2],R10                   @// C3: V contribution to G (negative as s16)

    MOVW        R10,#0x4092
    VMOV.16     D0[3],R10                   @// C4: U contribution to B

    @// Chroma bias: 128 replicated into all 8 bytes of D1
    MOV         R10,#128
    VDUP.8      D1,R10

    @// Stack arguments (see the prologue note for the #104 base)
    LDR         R5,[sp,#104]                @// height
    LDR         R6,[sp,#108]                @// luma stride
    LDR         R7,[sp,#112]                @// chroma stride
    LDR         R9,[sp,#116]                @// destination stride (pixels)

    @// Both loops below are do-while loops that count down to zero, so a zero
    @// (or negative) trip count would wrap and run away through memory.
    @// Bail out before touching any pixel in that case.
    CMP         R3,#16
    BLT         LABEL_YUV420SP_TO_RGB8888_EXIT  @// fewer than 16 columns (signed)
    CMP         R5,#2
    BLT         LABEL_YUV420SP_TO_RGB8888_EXIT  @// fewer than 2 rows (signed)

    @// Only whole groups of 16 pixels are converted. Round the width down so
    @// that the end-of-row offsets match the bytes the loop actually consumes.
    BIC         R3,R3,#15

    @// End-of-row skips: offset = stride - width
    SUB         R10,R6,R3                   @// luma skip in bytes
    SUB         R11,R7,R3                   @// chroma skip in bytes
    SUB         R14,R9,R3                   @// destination skip in pixels

    @// Two rows are converted per pass of the height loop
    MOV         R5,R5, LSR #1               @// row_pairs = height / 2

    @// Pointers to the second row of the first row pair
    ADD         R7,R0,R6                    @// luma_next = luma + luma_stride
    ADD         R8,R2,R9,LSL #2             @// rgb_next  = rgb + 4 * rgb_stride

LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:

    @// Prime the pipeline with the first 16 pixels of this row pair:
    @// 16 chroma bytes (8 U/V pairs) ...
    VLD1.8      {D2,D3},[R1]!

    @// ... the number of 16-pixel groups in a row ...
    MOV         R6,R3, LSR #4               @// width_cnt = width / 16

    @// ... and 16 luma bytes from each row, split into even/odd columns.
    VLD2.8      {D30,D31},[R0]!             @// D30 = Y0,Y2,..,Y14   D31 = Y1,Y3,..,Y15 (first row)
    VLD2.8      {D28,D29},[R7]!             @// D28 = Y0,Y2,..,Y14   D29 = Y1,Y3,..,Y15 (second row)

    @// The last group of each row is handled after the loop, where no further
    @// data is pre-loaded, so the loop itself runs width_cnt - 1 times.
    SUBS        R6,R6,#1
    BEQ         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    @// De-interleave chroma: D2 = even bytes (U), D3 = odd bytes (V)
    VUZP.8      D2,D3

    @// Remove the 128 bias and widen to 16 bits
    VSUBL.U8    Q2,D2,D1                    @// Q2 = U - 128
    VSUBL.U8    Q3,D3,D1                    @// Q3 = V - 128

    @// D2/D3 are free again: fetch the chroma of the next 16 pixels
    VLD1.8      {D2,D3},[R1]!

    PLD         [R1]

    @// 32-bit products of the chroma terms with the Q13 coefficients
    VMULL.S16   Q4,D4,D0[3]                 @// (U-128)*C4, low 4  -> B
    VMULL.S16   Q5,D5,D0[3]                 @// (U-128)*C4, high 4 -> B

    VMULL.S16   Q10,D6,D0[0]                @// (V-128)*C1, low 4  -> R
    VMULL.S16   Q11,D7,D0[0]                @// (V-128)*C1, high 4 -> R

    VMULL.S16   Q6,D4,D0[1]                 @// (U-128)*C2, low 4  -> G
    VMLAL.S16   Q6,D6,D0[2]                 @// Q6 += (V-128)*C3
    VMULL.S16   Q7,D5,D0[1]                 @// (U-128)*C2, high 4 -> G
    VMLAL.S16   Q7,D7,D0[2]                 @// Q7 += (V-128)*C3

    @// Drop the 13 fraction bits and narrow to s16 with saturation.
    @// Afterwards Q4 = B terms, Q5 = R terms, Q6 = G terms (8 lanes each).
    VQSHRN.S32  D8,Q4,#13
    VQSHRN.S32  D9,Q5,#13

    VQSHRN.S32  D10,Q10,#13
    VQSHRN.S32  D11,Q11,#13

    VQSHRN.S32  D12,Q6,#13
    VQSHRN.S32  D13,Q7,#13

    @// First row. Each chroma term serves one even and one odd column.
    VADDW.U8    Q7,Q4,D30                   @// even columns: Y + B term
    VADDW.U8    Q8,Q5,D30                   @// even columns: Y + R term
    VADDW.U8    Q9,Q6,D30                   @// even columns: Y + G term

    VADDW.U8    Q10,Q4,D31                  @// odd columns:  Y + B term
    VADDW.U8    Q11,Q5,D31                  @// odd columns:  Y + R term
    VADDW.U8    Q12,Q6,D31                  @// odd columns:  Y + G term

    @// Saturate the even columns to u8 and build B,G,R,0 pixels
    VQMOVUN.S16 D14,Q7                      @// B
    VQMOVUN.S16 D15,Q9                      @// G
    VQMOVUN.S16 D16,Q8                      @// R
    VMOV.I8     D17,#0                      @// fourth byte of every pixel

    VZIP.8      D14,D15                     @// B,G byte pairs
    VZIP.8      D16,D17                     @// R,0 byte pairs
    VZIP.16     Q7,Q8                       @// Q7 = pixels 0,2,4,6   Q8 = pixels 8,10,12,14

    @// Same for the odd columns
    VQMOVUN.S16 D20,Q10                     @// B
    VQMOVUN.S16 D21,Q12                     @// G
    VQMOVUN.S16 D22,Q11                     @// R
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11                     @// Q10 = pixels 1,3,5,7  Q11 = pixels 9,11,13,15

    @// Merge even and odd columns back into raster order
    VZIP.32     Q7,Q10                      @// Q7 = pixels 0-3    Q10 = pixels 4-7
    VZIP.32     Q8,Q11                      @// Q8 = pixels 8-11   Q11 = pixels 12-15

    @// Write 16 pixels (64 bytes) of the first row
    VST1.32     D14,[R2]!
    VST1.32     D15,[R2]!
    VST1.32     D20,[R2]!
    VST1.32     D21,[R2]!
    VST1.32     D16,[R2]!
    VST1.32     D17,[R2]!
    VST1.32     D22,[R2]!
    VST1.32     D23,[R2]!

    @// Second row: same chroma terms, luma taken from D28/D29
    VADDW.U8    Q7,Q4,D28                   @// even columns: Y + B term
    VADDW.U8    Q8,Q5,D28                   @// even columns: Y + R term
    VADDW.U8    Q9,Q6,D28                   @// even columns: Y + G term

    VADDW.U8    Q10,Q4,D29                  @// odd columns:  Y + B term
    VADDW.U8    Q11,Q5,D29                  @// odd columns:  Y + R term
    VADDW.U8    Q12,Q6,D29                  @// odd columns:  Y + G term

    @// All four luma registers have been consumed: fetch the luma of the
    @// next 16 pixels of both rows.
    VLD2.8      {D30,D31},[R0]!             @// first row, even/odd split
    VLD2.8      {D28,D29},[R7]!             @// second row, even/odd split

    PLD         [R0]
    PLD         [R7]

    VQMOVUN.S16 D14,Q7                      @// B
    VQMOVUN.S16 D15,Q9                      @// G
    VQMOVUN.S16 D16,Q8                      @// R
    VMOV.I8     D17,#0

    VZIP.8      D14,D15
    VZIP.8      D16,D17
    VZIP.16     Q7,Q8

    VQMOVUN.S16 D20,Q10                     @// B
    VQMOVUN.S16 D21,Q12                     @// G
    VQMOVUN.S16 D22,Q11                     @// R
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11

    VZIP.32     Q7,Q10
    VZIP.32     Q8,Q11

    @// Write 16 pixels (64 bytes) of the second row
    VST1.32     D14,[R8]!
    VST1.32     D15,[R8]!
    VST1.32     D20,[R8]!
    VST1.32     D21,[R8]!
    VST1.32     D16,[R8]!
    VST1.32     D17,[R8]!
    VST1.32     D22,[R8]!
    VST1.32     D23,[R8]!

    SUBS        R6,R6,#1                    @// width_cnt -= 1
    BNE         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    @// Last 16 pixels of the row pair. Identical arithmetic to the loop body,
    @// but nothing is pre-loaded so no read goes past the end of the rows.
    VUZP.8      D2,D3                       @// D2 = U, D3 = V

    VSUBL.U8    Q2,D2,D1                    @// Q2 = U - 128
    VSUBL.U8    Q3,D3,D1                    @// Q3 = V - 128

    VMULL.S16   Q4,D4,D0[3]                 @// (U-128)*C4, low 4  -> B
    VMULL.S16   Q5,D5,D0[3]                 @// (U-128)*C4, high 4 -> B

    VMULL.S16   Q10,D6,D0[0]                @// (V-128)*C1, low 4  -> R
    VMULL.S16   Q11,D7,D0[0]                @// (V-128)*C1, high 4 -> R

    VMULL.S16   Q6,D4,D0[1]                 @// (U-128)*C2, low 4  -> G
    VMLAL.S16   Q6,D6,D0[2]                 @// Q6 += (V-128)*C3
    VMULL.S16   Q7,D5,D0[1]                 @// (U-128)*C2, high 4 -> G
    VMLAL.S16   Q7,D7,D0[2]                 @// Q7 += (V-128)*C3

    @// Q4 = B terms, Q5 = R terms, Q6 = G terms after narrowing
    VQSHRN.S32  D8,Q4,#13
    VQSHRN.S32  D9,Q5,#13

    VQSHRN.S32  D10,Q10,#13
    VQSHRN.S32  D11,Q11,#13

    VQSHRN.S32  D12,Q6,#13
    VQSHRN.S32  D13,Q7,#13

    @// First row
    VADDW.U8    Q7,Q4,D30                   @// even columns: Y + B term
    VADDW.U8    Q8,Q5,D30                   @// even columns: Y + R term
    VADDW.U8    Q9,Q6,D30                   @// even columns: Y + G term

    VADDW.U8    Q10,Q4,D31                  @// odd columns:  Y + B term
    VADDW.U8    Q11,Q5,D31                  @// odd columns:  Y + R term
    VADDW.U8    Q12,Q6,D31                  @// odd columns:  Y + G term

    VQMOVUN.S16 D14,Q7                      @// B
    VQMOVUN.S16 D15,Q9                      @// G
    VQMOVUN.S16 D16,Q8                      @// R
    VMOV.I8     D17,#0

    VZIP.8      D14,D15
    VZIP.8      D16,D17
    VZIP.16     Q7,Q8

    VQMOVUN.S16 D20,Q10                     @// B
    VQMOVUN.S16 D21,Q12                     @// G
    VQMOVUN.S16 D22,Q11                     @// R
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11

    VZIP.32     Q7,Q10
    VZIP.32     Q8,Q11

    VST1.32     D14,[R2]!
    VST1.32     D15,[R2]!
    VST1.32     D20,[R2]!
    VST1.32     D21,[R2]!
    VST1.32     D16,[R2]!
    VST1.32     D17,[R2]!
    VST1.32     D22,[R2]!
    VST1.32     D23,[R2]!

    @// Second row
    VADDW.U8    Q7,Q4,D28                   @// even columns: Y + B term
    VADDW.U8    Q8,Q5,D28                   @// even columns: Y + R term
    VADDW.U8    Q9,Q6,D28                   @// even columns: Y + G term

    VADDW.U8    Q10,Q4,D29                  @// odd columns:  Y + B term
    VADDW.U8    Q11,Q5,D29                  @// odd columns:  Y + R term
    VADDW.U8    Q12,Q6,D29                  @// odd columns:  Y + G term

    VQMOVUN.S16 D14,Q7                      @// B
    VQMOVUN.S16 D15,Q9                      @// G
    VQMOVUN.S16 D16,Q8                      @// R
    VMOV.I8     D17,#0

    VZIP.8      D14,D15
    VZIP.8      D16,D17
    VZIP.16     Q7,Q8

    VQMOVUN.S16 D20,Q10                     @// B
    VQMOVUN.S16 D21,Q12                     @// G
    VQMOVUN.S16 D22,Q11                     @// R
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11

    VZIP.32     Q7,Q10
    VZIP.32     Q8,Q11

    VST1.32     D14,[R8]!
    VST1.32     D15,[R8]!
    VST1.32     D20,[R8]!
    VST1.32     D21,[R8]!
    VST1.32     D16,[R8]!
    VST1.32     D17,[R8]!
    VST1.32     D22,[R8]!
    VST1.32     D23,[R8]!

    @// Step to the next row pair. R7/R8 now point `width` past the start of
    @// the second row, so adding the end-of-row skip lands on the row after it.
    ADD         R0,R7,R10                   @// luma = luma_next + luma skip
    ADD         R2,R8,R14,LSL #2            @// rgb  = rgb_next + 4 * rgb skip

    ADD         R7,R0,R3                    @// luma_next = luma + width ...
    ADD         R8,R2,R3,LSL #2             @// rgb_next  = rgb + 4 * width ...

    ADD         R1,R1,R11                   @// chroma moves down by one row

    @// The strides are no longer held in registers, so stride is rebuilt as
    @// width + skip in two additions.
    ADD         R7,R7,R10                   @// ... + luma skip   = luma + luma_stride
    ADD         R8,R8,R14,LSL #2            @// ... + 4 * rgb skip = rgb + 4 * rgb_stride

    SUBS        R5,R5,#1                    @// row_pairs -= 1

    BNE         LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP

LABEL_YUV420SP_TO_RGB8888_EXIT:
    @// Epilogue: restore in the reverse order of the prologue and return
    VPOP        {d8-d15}
    LDMFD       SP!,{R4-R12,PC}

    .size ihevcd_fmt_conv_420sp_to_rgba8888_a9q, .-ihevcd_fmt_conv_420sp_to_rgba8888_a9q


    @ Empty .note.GNU-stack section with no flags: the conventional marker
    @ telling GNU linkers that this object does not need an executable stack.
    .section .note.GNU-stack,"",%progbits