@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@/*******************************************************************************
@* @file
@*  ihevcd_itrans_recon_dc_chroma.s
@*
@* @brief
@*  contains the function definition for inverse transform (itrans) and
@*  reconstruction (recon) of chroma blocks for the dc only case
@*
@* @author
@*  ittiam
@*
@* @par list of functions:
@*  - ihevcd_itrans_recon_dc_chroma_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************/

.text


.globl ihevcd_itrans_recon_dc_chroma_a9q

.type ihevcd_itrans_recon_dc_chroma_a9q, %function

@-------------------------------------------------------------------------------
@ void ihevcd_itrans_recon_dc_chroma(uword8 *pu1_pred,
@                                    uword8 *pu1_dst,
@                                    word32 pred_strd,
@                                    word32 dst_strd,
@                                    word32 log2_trans_size,
@                                    word16 i2_coeff_value)
@
@ dc-only inverse transform and reconstruction on byte-interleaved data.
@ One dc value is derived from i2_coeff_value and added, with saturation to
@ 0..255, to every even-offset byte (relative to pu1_pred) of the block.
@ Odd-offset bytes are not modified: they are copied from pred to dst as part
@ of the same full-width store.
@
@ in:       r0 = pu1_pred, r1 = pu1_dst, r2 = pred_strd, r3 = dst_strd
@           first two stack words = log2_trans_size, i2_coeff_value
@ out:      none
@ saved:    r0-r11 and d8-d15 are restored before returning
@ clobbers: q0-q3, q8-q15, flags
@
@ trans_size == 4 takes a dedicated path that touches 8 bytes per row.
@ Every other size is processed in tiles of 8 rows x 16 bytes, so it is
@ expected to be a multiple of 8.
@ NOTE(review): the 8x8 tile path always reads/writes 16 bytes per row per
@ tile starting at the pointer passed in - confirm callers leave room for
@ that when the pointer is at an odd (second-component) offset.
@-------------------------------------------------------------------------------

ihevcd_itrans_recon_dc_chroma_a9q:

    push        {r0-r11,lr}                 @ 13 words  = 0x34 bytes
    vpush       {d8-d15}                    @ 8 dwords  = 0x40 bytes
    @ stack arguments now start at sp + 0x34 + 0x40 = sp + 0x74
    ldr         r4,[sp,#0x74]               @ r4 = log2_trans_size
    ldr         r5,[sp,#0x78]               @ r5 = i2_coeff_value
    mov         r10,#1
    lsl         r4,r10,r4                   @ r4 = trans_size = (1 << log2_trans_size)
    mov         r6,#64                      @ 1 << (shift1 - 1), shift1 = 7
    mov         r7,#2048                    @ 1 << (shift2 - 1), shift2 = 12

    @ stage 1: r8 = clip_s16((i2_coeff_value * 64 + 64) >> 7)
    add         r8,r6,r5,lsl #6
    ssat        r8,#16,r8,asr #7
    @ stage 2: r6 = dc_value = clip_s16((r8 * 64 + 2048) >> 12)
    add         r5,r7,r8,lsl #6
    ssat        r6,#16,r5,asr #12

    @ r6 has the dc_value
    @ r4 has the trans_size value
    cmp         r4,#4
    beq         block_4x4_chroma

    vdup.16     q0,r6                       @ dc_value in all eight 16-bit lanes
    mov         r8,r4                       @ r8 = rows still to process


row_loop_chroma:
    mov         r9,r4                       @ r9 = columns still to process in this band


col_loop_chroma:

    @ load a tile of 8 rows x 16 bytes, de-interleaving on the fly:
    @ even-offset bytes -> d2,d4,...,d16   odd-offset bytes -> d3,d5,...,d17
    mov         r7,r0
    vld2.8      {d2,d3},[r7],r2
    vld2.8      {d4,d5},[r7],r2
    vld2.8      {d6,d7},[r7],r2
    vld2.8      {d8,d9},[r7],r2

    vld2.8      {d10,d11},[r7],r2
    vld2.8      {d12,d13},[r7],r2
    vld2.8      {d14,d15},[r7],r2
    vld2.8      {d16,d17},[r7]

    add         r0,r0,#16                   @ pred: next tile to the right

    @ widen the even-offset bytes to 16 bit and add dc_value
    vaddw.u8    q15,q0,d2
    vaddw.u8    q14,q0,d4
    vaddw.u8    q13,q0,d6
    vaddw.u8    q12,q0,d8
    vaddw.u8    q11,q0,d10
    vaddw.u8    q10,q0,d12
    vaddw.u8    q9,q0,d14

    @ saturate the sums back to 0..255
    mov         r11,r1
    vqmovun.s16 d2,q15
    vqmovun.s16 d4,q14
    vqmovun.s16 d6,q13
    vqmovun.s16 d8,q12

    @ row 7 is added only now because q15 was still holding the row 0 sum
    vaddw.u8    q15,q0,d16

    vqmovun.s16 d10,q11
    vqmovun.s16 d12,q10
    vqmovun.s16 d14,q9
    vqmovun.s16 d16,q15

    @ store the tile, re-interleaving updated even bytes with untouched odd bytes
    vst2.8      {d2,d3},[r11],r3
    vst2.8      {d4,d5},[r11],r3
    vst2.8      {d6,d7},[r11],r3
    vst2.8      {d8,d9},[r11],r3

    vst2.8      {d10,d11},[r11],r3
    vst2.8      {d12,d13},[r11],r3
    vst2.8      {d14,d15},[r11],r3
    vst2.8      {d16,d17},[r11]

    add         r1,r1,#16                   @ dst: next tile to the right

    subs        r9,r9,#8
    bgt         col_loop_chroma

    @ step to the next band of 8 rows: forward 8 strides, then back over the
    @ 2 * trans_size bytes the column loop advanced
    add         r0,r0,r2,lsl #3
    add         r1,r1,r3,lsl #3
    sub         r0,r0,r4,lsl #1
    sub         r1,r1,r4,lsl #1

    subs        r8,r8,#8
    bgt         row_loop_chroma
    b           end_loops_chroma


block_4x4_chroma:

    @ a 4x4 block is 8 interleaved bytes per row, so exactly 8 bytes per row
    @ are loaded and stored. the bytes stay interleaved in the register; the
    @ addend carries dc_value in even lanes and 0 in odd lanes so that only
    @ even-offset bytes change.
    uxth        r6,r6                       @ r6 = dc_value in bits 15:0, zero above
    vdup.32     q0,r6                       @ 16-bit lanes = {dc,0,dc,0,dc,0,dc,0}

    vld1.8      {d2},[r0],r2
    vld1.8      {d3},[r0],r2
    vld1.8      {d4},[r0],r2
    vld1.8      {d5},[r0]

    vaddw.u8    q12,q0,d2
    vaddw.u8    q13,q0,d3
    vaddw.u8    q14,q0,d4
    vaddw.u8    q15,q0,d5

    @ odd lanes hold 0..255 and pass through the saturation unchanged
    vqmovun.s16 d2,q12
    vqmovun.s16 d3,q13
    vqmovun.s16 d4,q14
    vqmovun.s16 d5,q15

    vst1.8      {d2},[r1],r3
    vst1.8      {d3},[r1],r3
    vst1.8      {d4},[r1],r3
    vst1.8      {d5},[r1]

end_loops_chroma:
    vpop        {d8-d15}
    pop         {r0-r11,pc}