///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///**
//*******************************************************************************
//*
//* //brief
//*     interprediction luma function for copy with 16-bit output
//*
//* //par description:
//*    copies the array of width 'wd' and height 'ht' from the location pointed
//*    by 'src' to the location pointed by 'dst'; each 8-bit source sample is
//*    widened to 16 bits and shifted left by 6 before it is stored
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients (not read by this routine)
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_luma_copy_w16out (
//                                uword8 *pu1_src,
//                                word16 *pi2_dst,
//                                word32 src_strd,
//                                word32 dst_strd,
//                                word8 *pi1_coeff,
//                                word32 ht,
//                                word32 wd   )
//**************variables vs registers*****************************************
//    x0 => *pu1_src
//    x1 => *pi2_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x7 =>  ht
//
//    x12 =>  wd
//
//-----------------------------------------------------------------------------
// void ihevc_inter_pred_luma_copy_w16out_av8(uword8 *pu1_src,    // x0
//                                            word16 *pi2_dst,    // x1
//                                            word32  src_strd,   // w2
//                                            word32  dst_strd,   // w3
//                                            word8  *pi1_coeff,  // x4 (unused)
//                                            word32  ht,         // w5
//                                            word32  wd)         // w6
//
// Syntax : GNU as, AArch64 (comments are '//').
// ABI    : AAPCS64, leaf function, no stack usage.
//
// For every sample of a wd x ht block:
//      pi2_dst[row * dst_strd + col] = pu1_src[row * src_strd + col] << 6
// dst_strd is counted in 16-bit elements, src_strd in bytes.
//
// The block is walked in bands of four rows.
//   * wd a multiple of 8 : 8 columns x 4 rows per step (software pipelined).
//   * otherwise          : 4 columns x 4 rows per step.
// Rows are always produced four at a time and columns four/eight at a time.
// NOTE(review): callers are presumably expected to pass ht as a multiple of 4
// and wd as a multiple of 4 - confirm against the C reference/callers.
//
// Nothing is read or written when ht <= 0 or wd <= 0.
//
// Clobbers: x0-x12, v0-v7, v16, v18, v20, v22, v24, v26, NZCV flags.
//-----------------------------------------------------------------------------

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_copy_w16out_av8

.type ihevc_inter_pred_luma_copy_w16out_av8, %function

ihevc_inter_pred_luma_copy_w16out_av8:

    // The word32 arguments only define the low 32 bits of their registers;
    // sign-extend them before they take part in 64-bit address arithmetic.
    sxtw        x2, w2                      // x2  = src_strd
    sxtw        x3, w3                      // x3  = dst_strd
    sxtw        x7, w5                      // x7  = ht
    sxtw        x12, w6                     // x12 = wd

    cmp         x7, #0                      // ht <= 0 : nothing to do
    ble         end_loops
    cmp         x12, #0                     // wd <= 0 : nothing to do
    ble         end_loops

    tst         x12, #7                     // wd % 8 == 0 -> 8-wide path
    beq         core_loop_wd_8

//-----------------------------------------------------------------------------
// 4 columns x 4 rows per inner iteration
//   x4  = columns left in the current band
//   x5  = source row pointer (rows 1..3), x10 = destination row pointer
//   x6  = dst_strd in bytes, x11 = wd - 4
//-----------------------------------------------------------------------------
    sub         x11, x12, #4                // x11 = wd - 4
    lsl         x6, x3, #1                  // x6 = 2 * dst_strd (bytes per dst row)

outer_loop_wd_4:
    mov         x4, x12                     // restart the column counter (wd > 0 here)

inner_loop_wd_4:
    ld1         {v0.s}[0], [x0]             // row 0: exactly 4 source bytes
    add         x5, x0, x2                  // x5 -> row 1 of this column group
    uxtl        v0.8h, v0.8b                // u8 -> u16
    add         x10, x1, x6                 // x10 -> dst row 1
    subs        x4, x4, #4                  // columns left; flags consumed by bgt below
    shl         v0.2d, v0.2d, #6            // << 6 (values < 2^14, no carry between lanes)
    ld1         {v22.s}[0], [x5], x2        // row 1
    add         x0, x0, #4                  // pu1_src += 4
    st1         {v0.d}[0], [x1]             // 4 x 16-bit results of row 0
    add         x1, x1, #8                  // pi2_dst += 4 elements
    uxtl        v22.8h, v22.8b
    ld1         {v24.s}[0], [x5], x2        // row 2
    shl         v22.2d, v22.2d, #6
    uxtl        v24.8h, v24.8b
    st1         {v22.d}[0], [x10], x6
    shl         v24.2d, v24.2d, #6
    ld1         {v26.s}[0], [x5], x2        // row 3; x5 now points at row 4
    st1         {v24.d}[0], [x10], x6
    uxtl        v26.8h, v26.8b
    shl         v26.2d, v26.2d, #6
    st1         {v26.d}[0], [x10], x6       // x10 now points at dst row 4
    bgt         inner_loop_wd_4

    subs        x7, x7, #4                  // four rows done
    sub         x0, x5, x11                 // src -> first column, four rows down
    sub         x1, x10, x11, lsl #1        // dst -> first column, four rows down
    bgt         outer_loop_wd_4

end_loops:
    ret

//-----------------------------------------------------------------------------
// 8 columns x 4 rows per tile, software pipelined:
// while tile k is widened/shifted/stored, tile k+1 is already being loaded.
//   x4  = columns left in the current band
//   x5  = dst_strd in bytes
//   x7  = 4 * (tiles still to be started) bookkeeping counter
//   x8  = 4*src_strd - wd   (src step from end of a band to start of the next)
//   x11 = 4*dst_strd - wd   (same for dst, in elements)
//   x9  = scratch
//-----------------------------------------------------------------------------
core_loop_wd_8:
    lsl         x5, x3, #1                  // x5 = 2 * dst_strd (bytes per dst row)
    lsl         x11, x3, #2
    sub         x11, x11, x12               // x11 = 4*dst_strd - wd
    lsl         x8, x2, #2
    sub         x8, x8, x12                 // x8 = 4*src_strd - wd
    lsr         x4, x12, #3                 // tiles per band = wd / 8
    mul         x7, x7, x4                  // ht * (wd / 8) = 4 * number of tiles
    mov         x4, x12                     // x4 = columns left in the band
    sub         x7, x7, #4                  // the first tile is started by the prolog

prolog:
    add         x6, x0, x2                  // x6 -> source row 1
    add         x10, x1, x5                 // x10 -> dst row 1
    ld1         {v1.8b}, [x0], #8           // rows 0..3 of the first tile
    ld1         {v3.8b}, [x6], x2
    ld1         {v5.8b}, [x6], x2
    ld1         {v7.8b}, [x6], x2
    uxtl        v16.8h, v1.8b               // u8 -> u16
    uxtl        v18.8h, v3.8b
    uxtl        v20.8h, v5.8b
    uxtl        v22.8h, v7.8b
    shl         v0.8h, v16.8h, #6           // << 6
    shl         v2.8h, v18.8h, #6
    shl         v4.8h, v20.8h, #6
    shl         v6.8h, v22.8h, #6
    st1         {v0.8h}, [x1], #16          // row 0 of the first tile

    subs        x7, x7, #4
    blt         epilog_end                  // only one tile: store rows 1..3 and
                                            // leave without touching further source

    // A second tile exists: advance the pointers and preload it.
    subs        x4, x4, #8                  // columns left in this band
    add         x9, x0, x8
    csel        x0, x9, x0, le              // band finished -> src four rows down
    add         x9, x1, x11, lsl #1
    csel        x1, x9, x1, le              // band finished -> dst four rows down
    csel        x4, x12, x4, le             // band finished -> reload column counter
    add         x6, x0, x2
    ld1         {v1.8b}, [x0], #8
    ld1         {v3.8b}, [x6], x2
    ld1         {v5.8b}, [x6], x2
    ld1         {v7.8b}, [x6], x2
    cbz         x7, epilog                  // exactly two tiles: no steady-state loop

outer_loop_wd_8:
    st1         {v2.8h}, [x10], x5          // rows 1..3 of the previous tile
    uxtl        v16.8h, v1.8b
    st1         {v4.8h}, [x10], x5
    uxtl        v18.8h, v3.8b
    st1         {v6.8h}, [x10], x5
    uxtl        v20.8h, v5.8b
    uxtl        v22.8h, v7.8b

    subs        x4, x4, #8                  // flags feed the three csel below
    add         x9, x0, x8
    csel        x0, x9, x0, le              // band finished -> src four rows down
    add         x6, x0, x2

    ld1         {v1.8b}, [x0], #8           // preload the next tile
    shl         v0.8h, v16.8h, #6
    ld1         {v3.8b}, [x6], x2
    shl         v2.8h, v18.8h, #6
    ld1         {v5.8b}, [x6], x2
    shl         v4.8h, v20.8h, #6
    ld1         {v7.8b}, [x6], x2
    add         x10, x1, x5                 // x10 -> dst row 1 of the current tile
    shl         v6.8h, v22.8h, #6

    st1         {v0.8h}, [x1], #16          // row 0 of the current tile
    add         x9, x1, x11, lsl #1
    csel        x1, x9, x1, le              // band finished -> dst four rows down
    csel        x4, x12, x4, le             // band finished -> reload column counter

    subs        x7, x7, #4
    bgt         outer_loop_wd_8

epilog:
    st1         {v2.8h}, [x10], x5          // rows 1..3 of the previous tile
    uxtl        v16.8h, v1.8b               // last tile was preloaded already
    st1         {v4.8h}, [x10], x5
    uxtl        v18.8h, v3.8b
    st1         {v6.8h}, [x10], x5
    uxtl        v20.8h, v5.8b
    uxtl        v22.8h, v7.8b
    shl         v0.8h, v16.8h, #6
    shl         v2.8h, v18.8h, #6
    shl         v4.8h, v20.8h, #6
    add         x10, x1, x5                 // x10 -> dst row 1 of the last tile
    shl         v6.8h, v22.8h, #6
    st1         {v0.8h}, [x1], #16          // row 0 of the last tile

epilog_end:
    st1         {v2.8h}, [x10], x5          // rows 1..3 of the last tile
    st1         {v4.8h}, [x10], x5
    st1         {v6.8h}, [x10], x5
    ret