///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevcd_itrans_recon_dc_luma.s
//*
//* //brief
//*  contains function definitions itrans and recon for dc only case
//*
//* //author
//*  ittiam
//*
//* //par list of functions:
//*  - ihevcd_itrans_recon_dc_luma_av8
//*
//* //remarks
//*  none
//*
//*******************************************************************************/

.text
.include "ihevc_neon_macros.s"

.globl ihevcd_itrans_recon_dc_luma_av8

.type ihevcd_itrans_recon_dc_luma_av8, %function

//------------------------------------------------------------------------------
//void ihevcd_itrans_recon_dc_luma(uword8 *pu1_pred,
//                                 uword8 *pu1_dst,
//                                 word32 pred_strd,
//                                 word32 dst_strd,
//                                 word32 log2_trans_size,
//                                 word16 i2_coeff_value)
//
// Target : AArch64, GNU as syntax, AAPCS64 calling convention (leaf function).
//
// Computes a single dc value from i2_coeff_value:
//     t        = clip_s16((i2_coeff_value * 64 + 64) >> 7)
//     dc_value = clip_s16((t * 64 + 2048) >> 12)
// then writes  dst[r][c] = saturate_u8(pred[r][c] + dc_value)  for every
// pixel of the trans_size x trans_size block.
//
// In:
//   x0 : pu1_pred
//   x1 : pu1_dst
//   w2 : pred_strd        (sign-extended to x2 on entry)
//   w3 : dst_strd         (sign-extended to x3 on entry)
//   x4 : log2_trans_size  (replaced by trans_size = 1 << log2_trans_size)
//   w5 : i2_coeff_value   (sign-extended from 16 bits on entry)
//
// Working registers:
//   x6       : dc_value              x7  : rounding constant / pred row pointer
//   x8       : rows remaining        x9  : columns remaining in the 8-row band
//   x10      : constant 1            x11 : dst row pointer
//   x12, x13 : clamp bounds (32767 / -32768)
//   v0.8h    : dc_value broadcast to all lanes
//
// Clobbers: x0-x13, v0-v7, v16-v18, v20, v22, v24, v26, v28, v30, flags.
// Stack   : not used.
//
// NOTE(review): a size of 4 takes the dedicated 4x4 path; every other size is
// processed in 8x8 tiles, so it is presumably expected to be 8, 16 or 32 --
// confirm against callers.
//------------------------------------------------------------------------------
ihevcd_itrans_recon_dc_luma_av8:

    // AAPCS64 leaves the bits above a narrow integer argument unspecified,
    // so widen everything that is later used as a 64-bit quantity.
    sxtw        x2, w2                      // pred_strd
    sxtw        x3, w3                      // dst_strd
    sxth        x5, w5                      // i2_coeff_value

    mov         x10, #1
    lsl         x4, x10, x4                 // trans_size = (1 << log2_trans_size)

    mov         x12, #32767                 // upper clamp bound
    mov         x13, #-32768                // lower clamp bound

    // stage 1: t = clip_s16((coeff * 64 + 64) >> 7)
    mov         x6, #64                     // 1 << (shift1 - 1)
    add         x8, x6, x5, lsl #6
    asr         x8, x8, #7
    cmp         x8, x12
    csel        x8, x12, x8, gt
    cmp         x8, x13
    csel        x8, x13, x8, lt

    // stage 2: dc_value = clip_s16((t * 64 + 2048) >> 12)
    mov         x7, #2048                   // 1 << (shift2 - 1)
    add         x6, x7, x8, lsl #6
    asr         x6, x6, #12
    cmp         x6, x12
    csel        x6, x12, x6, gt
    cmp         x6, x13
    csel        x6, x13, x6, lt

    // x6 has the dc_value
    // x4 has the trans_size value
    // x8 has the row value
    // x9 has the col value
    dup         v0.8h, w6
    mov         x8, x4

    cmp         x4, #4
    beq         row_loop_4

row_loop:
    mov         x9, x4                      // restart the column count for this band

col_loop:
    // load an 8x8 tile of prediction samples
    mov         x7, x0
    ld1         {v2.8b},  [x7], x2
    ld1         {v3.8b},  [x7], x2
    ld1         {v4.8b},  [x7], x2
    ld1         {v5.8b},  [x7], x2
    ld1         {v6.8b},  [x7], x2
    ld1         {v7.8b},  [x7], x2
    ld1         {v1.8b},  [x7], x2
    ld1         {v17.8b}, [x7]
    add         x0, x0, #8                  // next tile to the right (pred)

    // widen pred to 16 bits and add dc_value
    uaddw       v30.8h, v0.8h, v2.8b
    uaddw       v28.8h, v0.8h, v3.8b
    uaddw       v26.8h, v0.8h, v4.8b
    uaddw       v24.8h, v0.8h, v5.8b
    uaddw       v22.8h, v0.8h, v6.8b
    uaddw       v20.8h, v0.8h, v7.8b
    uaddw       v18.8h, v0.8h, v1.8b
    uaddw       v16.8h, v0.8h, v17.8b

    mov         x11, x1

    // saturate the signed 16-bit sums to unsigned 8 bits
    sqxtun      v2.8b,  v30.8h
    sqxtun      v3.8b,  v28.8h
    sqxtun      v4.8b,  v26.8h
    sqxtun      v5.8b,  v24.8h
    sqxtun      v6.8b,  v22.8h
    sqxtun      v7.8b,  v20.8h
    sqxtun      v1.8b,  v18.8h
    sqxtun      v17.8b, v16.8h

    // store the 8x8 reconstructed tile
    st1         {v2.2s},  [x11], x3
    st1         {v3.2s},  [x11], x3
    st1         {v4.2s},  [x11], x3
    st1         {v5.2s},  [x11], x3
    st1         {v6.2s},  [x11], x3
    st1         {v7.2s},  [x11], x3
    st1         {v1.2s},  [x11], x3
    st1         {v17.2s}, [x11]
    add         x1, x1, #8                  // next tile to the right (dst)

    subs        x9, x9, #8
    bgt         col_loop

    // Move both pointers down 8 rows and back to the first column.
    // The add/sub below do not touch the flags, so the final bgt still tests
    // the result of this subs.
    subs        x8, x8, #8
    add         x0, x0, x2, lsl #3
    add         x1, x1, x3, lsl #3
    sub         x0, x0, x4
    sub         x1, x1, x4
    bgt         row_loop
    b           end_loops

row_loop_4:
    // 4x4 block: read exactly 4 bytes per row so nothing beyond the block is
    // touched. The upper lanes of v2-v5 hold don't-care data and are never
    // stored.
    ld1         {v2.s}[0], [x0], x2
    ld1         {v3.s}[0], [x0], x2
    ld1         {v4.s}[0], [x0], x2
    ld1         {v5.s}[0], [x0]

    uaddw       v30.8h, v0.8h, v2.8b
    uaddw       v28.8h, v0.8h, v3.8b
    uaddw       v26.8h, v0.8h, v4.8b
    uaddw       v24.8h, v0.8h, v5.8b

    sqxtun      v2.8b, v30.8h
    sqxtun      v3.8b, v28.8h
    sqxtun      v4.8b, v26.8h
    sqxtun      v5.8b, v24.8h

    st1         {v2.s}[0], [x1], x3
    st1         {v3.s}[0], [x1], x3
    st1         {v4.s}[0], [x1], x3
    st1         {v5.s}[0], [x1]

end_loops:
    ret