///***************************************************************************** //* //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore //* //* Licensed under the Apache License, Version 2.0 (the "License"); //* you may not use this file except in compliance with the License. //* You may obtain a copy of the License at: //* //* http://www.apache.org/licenses/LICENSE-2.0 //* //* Unless required by applicable law or agreed to in writing, software //* distributed under the License is distributed on an "AS IS" BASIS, //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //* See the License for the specific language governing permissions and //* limitations under the License. //* //*****************************************************************************/ ///** // ******************************************************************************* // * //file // * ihevc_padding_neon.s // * // * //brief // * contains function definitions padding // * // * //author // * naveen sr // * // * //par list of functions: // * - ihevc_pad_left_luma() // * - ihevc_pad_left_chroma() // * // * //remarks // * none // * // ******************************************************************************* //*/ ///** //******************************************************************************* //* //* //brief //* padding (luma block) at the left of a 2d array //* //* //par description: //* the left column of a 2d array is replicated for pad_size times at the left //* //* //* //param[in] pu1_src //* uword8 pointer to the source //* //* //param[in] src_strd //* integer source stride //* //* //param[in] ht //* integer height of the array //* //* //param[in] wd //* integer width of the array //* //* //param[in] pad_size //* integer -padding size of the array //* //* //param[in] ht //* integer height of the array //* //* //param[in] wd //* integer width of the array //* //* //returns //* //* //remarks //* none //* 
//*******************************************************************************
//*/
//.if pad_left_luma == c
//void ihevc_pad_left_luma(uword8 *pu1_src,
//                         word32 src_strd,
//                         word32 ht,
//                         word32 pad_size)
//
// AArch64 implementation notes (GNU as syntax, AAPCS64):
//   for every row r in [0, ht) the byte at pu1_src[r * src_strd] is replicated
//   into the 80 bytes starting at (pu1_src + r * src_strd - pad_size).
//   the amount written per row is fixed at 80 bytes (5 x 16-byte stores);
//   pad_size only selects the start address, so pad_size is assumed to be 80.
//   rows are processed four at a time, any remaining 1..3 rows one at a time;
//   nothing is written when ht <= 0.
//   clobbers: x0-x11, v0, v2, v4, v6, condition flags. no stack usage.
//
//**************variables vs registers*************************
//    x0      => *pu1_src (advanced by src_strd after every row)
//    x1      => src_strd (sign-extended from w1)
//    w2      => ht       (rows still to be padded)
//    x3      => pad_size (sign-extended from w3)
//    x4-x7   => store pointers for the rows being padded
//    w8-w11  => pixel values to replicate

.text
.align 4

.globl ihevc_pad_left_luma_av8
.type ihevc_pad_left_luma_av8, %function

ihevc_pad_left_luma_av8:
    // the word32 arguments arrive in w1/w2/w3; the upper halves of x1/x2/x3
    // are not guaranteed by the procedure call standard, so widen before any
    // 64-bit address arithmetic and count rows in w2 only
    sxtw        x1, w1
    sxtw        x3, w3

    cmp         w2, #4
    b.lt        tail_luma_left          // fewer than 4 rows: skip the unrolled loop

loop_start_luma_left:
    // pad size is assumed to be pad_left = 80
    sub         x4, x0, x3              // x4 = first pad byte of row 0

    // fetch the left-most pixel of four consecutive rows before storing
    ldrb        w8, [x0]
    add         x0, x0, x1
    ldrb        w9, [x0]
    add         x0, x0, x1
    ldrb        w10, [x0]
    add         x0, x0, x1
    ldrb        w11, [x0]
    add         x0, x0, x1

    dup         v0.16b, w8
    dup         v2.16b, w9
    dup         v4.16b, w10
    dup         v6.16b, w11

    add         x5, x4, x1              // pad start of row 1
    add         x6, x5, x1              // pad start of row 2
    add         x7, x6, x1              // pad start of row 3

    // 5 x 16 bytes = 80 bytes per row
    .rept 4
    st1         {v0.16b}, [x4], #16
    .endr
    st1         {v0.16b}, [x4]

    .rept 4
    st1         {v2.16b}, [x5], #16
    .endr
    st1         {v2.16b}, [x5]

    .rept 4
    st1         {v4.16b}, [x6], #16
    .endr
    st1         {v4.16b}, [x6]

    .rept 4
    st1         {v6.16b}, [x7], #16
    .endr
    st1         {v6.16b}, [x7]

    // total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
    sub         w2, w2, #4
    cmp         w2, #4
    b.ge        loop_start_luma_left

tail_luma_left:
    cmp         w2, #0
    b.le        end_luma_left           // no rows left (or ht <= 0)

row_loop_luma_left:
    // 1..3 leftover rows, one row per iteration
    sub         x4, x0, x3
    ldrb        w8, [x0]
    add         x0, x0, x1
    dup         v0.16b, w8

    .rept 4
    st1         {v0.16b}, [x4], #16
    .endr
    st1         {v0.16b}, [x4]

    subs        w2, w2, #1
    b.gt        row_loop_luma_left

end_luma_left:
    ret



///**
//*******************************************************************************
//*
//* //brief
//*   padding (chroma block) at the left of a 2d array
//*
//* //par description:
//*   the left column of a 2d array is replicated for pad_size times at the left
//*
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] pad_size
//*  integer -padding size of the array
//*
//* //returns
//*
//* //remarks
//*  the amount written per row is fixed at 80 bytes (5 x 16-byte stores);
//*  pad_size only selects the start address, so pad_size is assumed to be 80.
//*  rows are processed four at a time, any remaining 1..3 rows one at a time;
//*  nothing is written when ht <= 0.
//*  clobbers: x0-x11, v0, v2, v4, v6, condition flags. no stack usage.
//*
//*******************************************************************************
//*/
//.if pad_left_chroma == c
//void ihevc_pad_left_chroma(uword8 *pu1_src,
//                           word32 src_strd,
//                           word32 ht,
//                           word32 pad_size)
//
//    x0      => *pu1_src (advanced by src_strd after every row)
//    x1      => src_strd (sign-extended from w1)
//    w2      => ht       (rows still to be padded)
//    x3      => pad_size (sign-extended from w3)
//    x4-x7   => store pointers for the rows being padded
//    w8-w11  => 16-bit elements to replicate

.globl ihevc_pad_left_chroma_av8
.type ihevc_pad_left_chroma_av8, %function

ihevc_pad_left_chroma_av8:
    // the word32 arguments arrive in w1/w2/w3; the upper halves of x1/x2/x3
    // are not guaranteed by the procedure call standard, so widen before any
    // 64-bit address arithmetic and count rows in w2 only
    sxtw        x1, w1
    sxtw        x3, w3

    cmp         w2, #4
    b.lt        tail_chroma_left        // fewer than 4 rows: skip the unrolled loop

loop_start_chroma_left:
    // pad size is assumed to be pad_left = 80
    sub         x4, x0, x3              // x4 = first pad byte of row 0

    // fetch the left-most 2-byte element of four consecutive rows
    // (presumably the interleaved chroma pair - confirm against caller)
    ldrh        w8, [x0]
    add         x0, x0, x1
    ldrh        w9, [x0]
    add         x0, x0, x1
    ldrh        w10, [x0]
    add         x0, x0, x1
    ldrh        w11, [x0]
    add         x0, x0, x1

    dup         v0.8h, w8
    dup         v2.8h, w9
    dup         v4.8h, w10
    dup         v6.8h, w11

    add         x5, x4, x1              // pad start of row 1
    add         x6, x5, x1              // pad start of row 2
    add         x7, x6, x1              // pad start of row 3

    // 5 x 16 bytes = 80 bytes per row
    .rept 4
    st1         {v0.16b}, [x4], #16
    .endr
    st1         {v0.16b}, [x4]

    .rept 4
    st1         {v2.16b}, [x5], #16
    .endr
    st1         {v2.16b}, [x5]

    .rept 4
    st1         {v4.16b}, [x6], #16
    .endr
    st1         {v4.16b}, [x6]

    .rept 4
    st1         {v6.16b}, [x7], #16
    .endr
    st1         {v6.16b}, [x7]

    // total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
    sub         w2, w2, #4
    cmp         w2, #4
    b.ge        loop_start_chroma_left

tail_chroma_left:
    cmp         w2, #0
    b.le        end_chroma_left         // no rows left (or ht <= 0)

row_loop_chroma_left:
    // 1..3 leftover rows, one row per iteration
    sub         x4, x0, x3
    ldrh        w8, [x0]
    add         x0, x0, x1
    dup         v0.8h, w8

    .rept 4
    st1         {v0.16b}, [x4], #16
    .endr
    st1         {v0.16b}, [x4]

    subs        w2, w2, #1
    b.gt        row_loop_chroma_left

end_chroma_left:
    ret



///**
//*******************************************************************************
//*
//* //brief
//* padding (luma block) at the right of a 2d array
//*
//* //par description:
//* the right column of a 2d array is replicated for pad_size times at the right
//*
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] pad_size
//*  integer -padding size of the array
//*
//* //returns
//*
//* //remarks
//*  for every row the byte at pu1_src[-1] is replicated into the 80 bytes
//*  starting at pu1_src. pad_size (x3) is not read by this routine: the amount
//*  written per row is fixed at 80 bytes, so pad_size is assumed to be 80.
//*  rows are processed four at a time, any remaining 1..3 rows one at a time;
//*  nothing is written when ht <= 0.
//*  clobbers: x0-x2, x4-x11, v0, v2, v4, v6, condition flags. no stack usage.
//*
//*******************************************************************************
//*/
//.if pad_right_luma == c
//void ihevc_pad_right_luma(uword8 *pu1_src,
//                        word32 src_strd,
//                        word32 ht,
//                        word32 pad_size)
//{
//    word32 row//
//
//    for(row = 0// row < ht// row++)
//    {
//        memset(pu1_src, *(pu1_src -1), pad_size)//
//
//        pu1_src += src_strd//
//    }
//}
//
//    x0      => *pu1_src (advanced by src_strd after every row)
//    x1      => src_strd (sign-extended from w1)
//    w2      => ht       (rows still to be padded)
//    x3      => pad_size (unused)
//    x4-x7   => store pointers for the rows being padded
//    w8-w11  => pixel values to replicate

.globl ihevc_pad_right_luma_av8
.type ihevc_pad_right_luma_av8, %function

ihevc_pad_right_luma_av8:
    // src_strd is a word32 in w1; the upper half of x1 is not guaranteed by
    // the procedure call standard, so widen it. rows are counted in w2 only
    sxtw        x1, w1

    cmp         w2, #4
    b.lt        tail_luma_right         // fewer than 4 rows: skip the unrolled loop

loop_start_luma_right:
    // pad size is assumed to be pad_left = 80
    mov         x4, x0                  // x4 = first pad byte of row 0

    // fetch the right-most pixel (one byte before the pad area) of four rows
    ldrb        w8, [x0, #-1]
    add         x0, x0, x1
    ldrb        w9, [x0, #-1]
    add         x0, x0, x1
    ldrb        w10, [x0, #-1]
    add         x0, x0, x1
    ldrb        w11, [x0, #-1]
    add         x0, x0, x1

    add         x5, x4, x1              // pad start of row 1
    add         x6, x5, x1              // pad start of row 2
    add         x7, x6, x1              // pad start of row 3

    dup         v0.16b, w8
    dup         v2.16b, w9
    dup         v4.16b, w10
    dup         v6.16b, w11

    // 5 x 16 bytes = 80 bytes per row
    .rept 4
    st1         {v0.16b}, [x4], #16
    .endr
    st1         {v0.16b}, [x4]

    .rept 4
    st1         {v2.16b}, [x5], #16
    .endr
    st1         {v2.16b}, [x5]

    .rept 4
    st1         {v4.16b}, [x6], #16
    .endr
    st1         {v4.16b}, [x6]

    .rept 4
    st1         {v6.16b}, [x7], #16
    .endr
    st1         {v6.16b}, [x7]

    // total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
    sub         w2, w2, #4
    cmp         w2, #4
    b.ge        loop_start_luma_right

tail_luma_right:
    cmp         w2, #0
    b.le        end_luma_right          // no rows left (or ht <= 0)

row_loop_luma_right:
    // 1..3 leftover rows, one row per iteration
    mov         x4, x0
    ldrb        w8, [x0, #-1]
    add         x0, x0, x1
    dup         v0.16b, w8

    .rept 4
    st1         {v0.16b}, [x4], #16
    .endr
    st1         {v0.16b}, [x4]

    subs        w2, w2, #1
    b.gt        row_loop_luma_right

end_luma_right:
    ret



///**
//*******************************************************************************
//*
//* //brief
//* padding (chroma block) at the right of a 2d array
//*
//* //par description:
//* the right column of a 2d array is replicated for pad_size times at the right
//*
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] pad_size
//*  integer -padding size of the array
//*
//* //returns
//*
//* //remarks
//*  for every row the 2-byte element at pu1_src[-2] is replicated into the
//*  80 bytes starting at pu1_src. pad_size (x3) is not read by this routine:
//*  the amount written per row is fixed at 80 bytes, so pad_size is assumed
//*  to be 80.
//*  rows are processed four at a time, any remaining 1..3 rows one at a time;
//*  nothing is written when ht <= 0.
//*  clobbers: x0-x2, x4-x11, v0, v2, v4, v6, condition flags. no stack usage.
//*
//*******************************************************************************
//*/
//.if pad_right_chroma == c
//void ihevc_pad_right_chroma(uword8 *pu1_src,
//                        word32 src_strd,
//                        word32 ht,
//                        word32 pad_size)
//
//    x0      => *pu1_src (advanced by src_strd after every row)
//    x1      => src_strd (sign-extended from w1)
//    w2      => ht       (rows still to be padded)
//    x3      => pad_size (unused)
//    x4-x7   => store pointers for the rows being padded
//    w8-w11  => 16-bit elements to replicate

.globl ihevc_pad_right_chroma_av8
.type ihevc_pad_right_chroma_av8, %function

ihevc_pad_right_chroma_av8:
    // src_strd is a word32 in w1; the upper half of x1 is not guaranteed by
    // the procedure call standard, so widen it. rows are counted in w2 only
    sxtw        x1, w1

    cmp         w2, #4
    b.lt        tail_chroma_right       // fewer than 4 rows: skip the unrolled loop

loop_start_chroma_right:
    // pad size is assumed to be pad_left = 80
    mov         x4, x0                  // x4 = first pad byte of row 0

    // fetch the right-most 2-byte element of four consecutive rows
    // (presumably the interleaved chroma pair - confirm against caller)
    ldrh        w8, [x0, #-2]
    add         x0, x0, x1
    ldrh        w9, [x0, #-2]
    add         x0, x0, x1
    ldrh        w10, [x0, #-2]
    add         x0, x0, x1
    ldrh        w11, [x0, #-2]
    add         x0, x0, x1

    dup         v0.8h, w8
    dup         v2.8h, w9
    dup         v4.8h, w10
    dup         v6.8h, w11

    add         x5, x4, x1              // pad start of row 1
    add         x6, x5, x1              // pad start of row 2
    add         x7, x6, x1              // pad start of row 3

    // 5 x 16 bytes = 80 bytes per row
    .rept 4
    st1         {v0.16b}, [x4], #16
    .endr
    st1         {v0.16b}, [x4]

    .rept 4
    st1         {v2.16b}, [x5], #16
    .endr
    st1         {v2.16b}, [x5]

    .rept 4
    st1         {v4.16b}, [x6], #16
    .endr
    st1         {v4.16b}, [x6]

    .rept 4
    st1         {v6.16b}, [x7], #16
    .endr
    st1         {v6.16b}, [x7]

    // total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
    sub         w2, w2, #4
    cmp         w2, #4
    b.ge        loop_start_chroma_right

tail_chroma_right:
    cmp         w2, #0
    b.le        end_chroma_right        // no rows left (or ht <= 0)

row_loop_chroma_right:
    // 1..3 leftover rows, one row per iteration
    mov         x4, x0
    ldrh        w8, [x0, #-2]
    add         x0, x0, x1
    dup         v0.8h, w8

    .rept 4
    st1         {v0.16b}, [x4], #16
    .endr
    st1         {v0.16b}, [x4]

    subs        w2, w2, #1
    b.gt        row_loop_chroma_right

end_chroma_right:
    ret