///***************************************************************************** //* //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore //* //* Licensed under the Apache License, Version 2.0 (the "License"); //* you may not use this file except in compliance with the License. //* You may obtain a copy of the License at: //* //* http://www.apache.org/licenses/LICENSE-2.0 //* //* Unless required by applicable law or agreed to in writing, software //* distributed under the License is distributed on an "AS IS" BASIS, //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //* See the License for the specific language governing permissions and //* limitations under the License. //* //*****************************************************************************/ ///** ///** //******************************************************************************* //* //* //brief //* interprediction luma function for copy //* //* //par description: //* copies the array of width 'wd' and height 'ht' from the location pointed //* by 'src' to the location pointed by 'dst' //* //* //param[in] pu1_src //* uword8 pointer to the source //* //* //param[out] pu1_dst //* uword8 pointer to the destination //* //* //param[in] src_strd //* integer source stride //* //* //param[in] dst_strd //* integer destination stride //* //* //param[in] pi1_coeff //* word8 pointer to the filter coefficients //* //* //param[in] ht //* integer height of the array //* //* //param[in] wd //* integer width of the array //* //* //returns //* //* //remarks //* none //* //******************************************************************************* //*/ //void ihevc_inter_pred_luma_copy ( // uword8 *pu1_src, // uword8 *pu1_dst, // word32 src_strd, // word32 dst_strd, // word8 *pi1_coeff, // word32 ht, // word32 wd ) //**************variables vs registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst // x2 => src_strd // x3 => dst_strd // x11 => ht // x16 
//                  => wd   (completes the "x16" register-map entry that ends the previous line)

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_copy_av8
.type ihevc_inter_pred_luma_copy_av8, %function

//------------------------------------------------------------------------------
// ihevc_inter_pred_luma_copy_av8
//
// Syntax : GNU as, AArch64.            ABI : AAPCS64.
//
// Copies a block of 'wd' x 'ht' bytes from pu1_src to pu1_dst. No filtering is
// applied; pi1_coeff is accepted for signature compatibility and never read.
//
// In   : x0 = pu1_src    (byte pointer)
//        x1 = pu1_dst    (byte pointer)
//        w2 = src_strd   (word32, bytes between source rows)
//        w3 = dst_strd   (word32, bytes between destination rows)
//        x4 = pi1_coeff  (unused)
//        w5 = ht         (word32)
//        w6 = wd         (word32)
// Out  : none (void). x0 is left holding a stale source cursor.
// Clobb: x0-x3, x8-x11, x15, x16, v0-v3, NZCV flags.
// Stack: not used (leaf routine, no callee-saved register is written).
//
// Register roles inside the loops:
//        x8  = bytes of the current band's width still to copy
//        x9  = source cursor walking rows 1..3 of the current column
//        x10 = destination cursor walking rows 1..3 of the current column
//        x11 = rows still to copy
//        x15 = wd - step, the distance to rewind from the last column to the
//              start of the next band
//        x16 = wd
//
// The block is processed in bands of four rows. The column step is chosen once
// from wd: 16 bytes when wd is a multiple of 16, 8 bytes when it is a multiple
// of 8, otherwise 4 bytes.
//
// Preconditions visible from the code:
//   * Rows are always written four at a time, so when ht is not a multiple of
//     4 the routine copies up to the next multiple of 4 rows.
//   * In the 4-byte path the band rewind uses (wd - 4); this only lands on the
//     start of the next band when wd is a multiple of 4.
//   NOTE(review): callers presumably guarantee both - confirm against callers.
//
// Changes relative to the previous revision:
//   * The word32 arguments are sign-extended on entry. AAPCS64 leaves the
//     upper 32 bits of a 32-bit argument register unspecified, so using
//     x2/x3/x5/x6 directly could feed garbage into pointer arithmetic and the
//     loop counters (and would mis-handle a negative stride).
//   * Removed the leftover cycle-counter arithmetic (sub x0,x20,x19) and the
//     x19/x20 save/restore that existed only to support it.
//   * wd <= 0 is rejected once at entry instead of once per band; previously
//     that case fell through to the band rewind with x9/x10 uninitialised.
//------------------------------------------------------------------------------
ihevc_inter_pred_luma_copy_av8:
    sxtw        x2,w2                       //src_strd as a 64-bit signed byte offset
    sxtw        x3,w3                       //dst_strd as a 64-bit signed byte offset
    sxtw        x11,w5                      //x11 = ht
    sxtw        x16,w6                      //x16 = wd

    cmp         x11,#0                      //nothing to copy when ht <= 0
    ble         end_loops
    cmp         x16,#0                      //nothing to copy when wd <= 0
    ble         end_loops

    tst         x16,#15                     //wd a multiple of 16 -> 16-byte columns
    beq         core_loop_wd_16
    tst         x16,#7                      //wd a multiple of 8 -> 8-byte columns
    beq         core_loop_wd_8

    sub         x15,x16,#4                  //rewind distance for the 4-byte path

outer_loop_wd_4:
    mov         x8,x16                      //restart the column counter for this band

inner_loop_wd_4:
    ld1         {v0.s}[0],[x0]              //row 0: load 4 bytes
    add         x9,x0,x2                    //x9  = source cursor at row 1
    add         x10,x1,x3                   //x10 = destination cursor at row 1
    st1         {v0.s}[0],[x1]              //row 0: store 4 bytes
    ld1         {v0.s}[0],[x9],x2           //row 1: load, cursor moves down one row
    add         x0,x0,#4                    //pu1_src += 4 (next column)
    st1         {v0.s}[0],[x10],x3          //row 1: store, cursor moves down one row
    ld1         {v0.s}[0],[x9],x2           //row 2: load
    subs        x8,x8,#4                    //4 more bytes of the width are done
    st1         {v0.s}[0],[x10],x3          //row 2: store
    ld1         {v0.s}[0],[x9],x2           //row 3: load
    add         x1,x1,#4                    //pu1_dst += 4 (next column)
    st1         {v0.s}[0],[x10],x3          //row 3: store
    bgt         inner_loop_wd_4             //flags still come from the subs above

    subs        x11,x11,#4                  //four rows done
    sub         x0,x9,x15                   //pu1_src = first column, four rows further down
    sub         x1,x10,x15                  //pu1_dst = first column, four rows further down
    bgt         outer_loop_wd_4

end_loops:
    ret

core_loop_wd_8:
    sub         x15,x16,#8                  //rewind distance for the 8-byte path

outer_loop_wd_8:
    mov         x8,x16                      //restart the column counter for this band

inner_loop_wd_8:
    add         x9,x0,x2                    //x9  = source cursor at row 1
    ld1         {v0.8b},[x0],#8             //row 0: load 8 bytes, pu1_src += 8
    add         x10,x1,x3                   //x10 = destination cursor at row 1
    st1         {v0.8b},[x1],#8             //row 0: store 8 bytes, pu1_dst += 8
    ld1         {v1.8b},[x9],x2             //row 1: load
    st1         {v1.8b},[x10],x3            //row 1: store
    subs        x8,x8,#8                    //8 more bytes of the width are done
    ld1         {v2.8b},[x9],x2             //row 2: load
    st1         {v2.8b},[x10],x3            //row 2: store
    ld1         {v3.8b},[x9],x2             //row 3: load
    st1         {v3.8b},[x10],x3            //row 3: store
    bgt         inner_loop_wd_8             //flags still come from the subs above

    subs        x11,x11,#4                  //four rows done
    sub         x0,x9,x15                   //pu1_src = first column, four rows further down
    sub         x1,x10,x15                  //pu1_dst = first column, four rows further down
    bgt         outer_loop_wd_8

    ret

core_loop_wd_16:
    sub         x15,x16,#16                 //rewind distance for the 16-byte path

outer_loop_wd_16:
    mov         x8,x16                      //restart the column counter for this band

inner_loop_wd_16:
    add         x9,x0,x2                    //x9  = source cursor at row 1
    ld1         {v0.16b},[x0],#16           //row 0: load 16 bytes, pu1_src += 16
    add         x10,x1,x3                   //x10 = destination cursor at row 1
    st1         {v0.16b},[x1],#16           //row 0: store 16 bytes, pu1_dst += 16
    ld1         {v1.16b},[x9],x2            //row 1: load
    st1         {v1.16b},[x10],x3           //row 1: store
    subs        x8,x8,#16                   //16 more bytes of the width are done
    ld1         {v2.16b},[x9],x2            //row 2: load
    st1         {v2.16b},[x10],x3           //row 2: store
    ld1         {v3.16b},[x9],x2            //row 3: load
    st1         {v3.16b},[x10],x3           //row 3: store
    bgt         inner_loop_wd_16            //flags still come from the subs above

    subs        x11,x11,#4                  //four rows done
    sub         x0,x9,x15                   //pu1_src = first column, four rows further down
    sub         x1,x10,x15                  //pu1_dst = first column, four rows further down
    bgt         outer_loop_wd_16

    ret