///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//*
//file
//*  ihevc_deblk_luma_vert.s
//*
//brief
//*  contains the luma deblocking filter for a vertical edge.
//*  hand-written AArch64 (armv8) neon assembly, GNU assembler syntax.
//*
//author
//*  anand s
//*
//par list of functions:
//*  - ihevc_deblk_luma_vert_av8
//*
//remarks
//*  none
//*
//*******************************************************************************/

//------------------------------------------------------------------------------
// ihevc_deblk_luma_vert_av8
//
// Filters a 4-row segment across a vertical luma edge. Each row is read as the
// 8 bytes pu1_src[-4] .. pu1_src[3]; up to three samples on either side of the
// edge are rewritten in place.
//
// Arguments on entry (argument names are taken from the formulas quoted in the
// comments below; the C prototype itself is not part of this file):
//   x0       pu1_src            address of the first sample right of the edge
//   w1       src_strd           row stride in bytes
//   w2       bs                 boundary strength (only bs >> 1 is used)
//   w3, w4   quant_param_p/q    the two quantisation parameters (only their sum
//                               is used)
//   w5       beta_offset_div2
//   w6       tc_offset_div2
//   w7       filter flag p      0 => p side is not written
//   [sp]     filter flag q      0 => q side is not written
//
// All 32-bit register arguments are sign-extended on entry because the upper
// halves of their X registers are unspecified under AAPCS64.
//
// Sample-to-register map after the transpose (4 rows per register, byte lanes
// 0..3):
//   v24.s[0]=p3  v6=p2  v3=p1  v2=p0  |  v4=q0  v5=q1  v7=q2  v22=q3
//
// Saves/restores d8-d15 and x19-x22 (96 bytes of stack).
// External data: gai4_ihevc_beta_table, gai4_ihevc_tc_table (32-bit entries,
// reached through the GOT).
//------------------------------------------------------------------------------

.text
.align 4

.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table

.globl ihevc_deblk_luma_vert_av8

.type ihevc_deblk_luma_vert_av8, %function

ihevc_deblk_luma_vert_av8:

    // widen every 32-bit register argument before it is used as a 64-bit value
    sxtw        x1,w1
    sxtw        x2,w2
    sxtw        x3,w3
    sxtw        x4,w4
    sxtw        x5,w5
    sxtw        x6,w6
    stp         d8,d9,[sp,#-16]!
    stp         d10,d11,[sp,#-16]!
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    sxtw        x21,w7                  // x21 = flag p (kept until the decision is made)
    ldr         w22,[sp,#96]            // x22 = flag q: first stack argument, above the
                                        // 6 * 16 bytes pushed above (zero-extended)
    add         x3,x3,x4
    add         x3,x3,#1
    asr         x3,x3,#1                // x3 = qp_luma
    add         x7,x3,x5,lsl #1
    add         x3,x3,x6,lsl #1
    cmp         x7,#0x33
    mov         x20,#0x33
    csel        x7, x20, x7,gt
    bgt         l1.56
    cmp         x7,#0x0
    mov         x20,#0x0
    csel        x7, x20, x7,lt
    // x7 has the beta_index value
l1.56:

//  bic         x2,x2,#1
    asr         x2,x2,#1

    add         x3,x3,x2,lsl #1
    cmp         x3,#0x35
    mov         x20,#0x35
    csel        x3, x20, x3,gt
    bgt         l1.88
    cmp         x3,#0x0
    mov         x20,#0x0
    csel        x3, x20, x3,lt
    // x3 has the tc_index value

    // qp_luma = (quant_param_p + quant_param_q + 1) >> 1//
    // beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)//
    // tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)//

l1.88:
    adrp        x2, :got:gai4_ihevc_beta_table
    ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]

    movi        v18.8b, #0x2
    adrp        x4, :got:gai4_ihevc_tc_table
    ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]

    ldr         w5,[x2,x7,lsl #2]       // beta
    movi        v16.8h, #0x2
    ldr         w6,[x4,x3,lsl #2]       // tc
    lsl         x8,x6,#1
    cmp         x6,#0
    dup         v19.8b,w8               // v19 = 2 * tc
    sub         x7,x0,#4
    movi        v23.8b, #0x3
    beq         l1.964                  // tc == 0 : nothing to filter

    // load 4 rows of 8 samples (pu1_src - 4 onwards) and, interleaved with
    // the transpose, read the scalars needed for the row-0 / row-3 decisions
    sub         x19,x0,#3
    ld1         {v15.8b},[x7],x1
    ldrb        w8,[x19]                // -3 value
    ld1         {v1.8b},[x7],x1
    ldrb        w10,[x19,#1]            // -2 value
    ld1         {v29.8b},[x7],x1
    ldrb        w11,[x19,#2]            // -1 value
    ld1         {v0.8b},[x7]

    ldrb        w12,[x0,#0]             // 0 value
    ldrb        w9,[x0,#1]              // 1 value
    trn1        v24.8b,v15.8b,v1.8b
    trn2        v1.8b,v15.8b,v1.8b
    ldrb        w2,[x0,#2]              // 2 value
    trn1        v2.8b,v29.8b,v0.8b
    trn2        v0.8b,v29.8b,v0.8b
    add         x12,x12,x2
    subs        x9,x12,x9,lsl #1        // dq0 value is stored in x9
    csneg       x9,x9,x9,pl
    // dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )//
    mov         v29.8b,v24.8b
    trn1        v24.4h,v29.4h,v2.4h
    trn2        v2.4h,v29.4h,v2.4h
    add         x8,x8,x11
    mov         v15.8b,v1.8b
    trn1        v1.4h,v15.4h,v0.4h
    trn2        v0.4h,v15.4h,v0.4h
    subs        x8,x8,x10,lsl #1
    csneg       x8,x8,x8,pl
    // dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )//

    add         x14,x1,x1,lsl #1        // 3 * src_strd
    add         x14,x0,x14              // x14 = address of row 3

    sub         x19,x14,#3
    dup         v4.2s, v24.s[1]         // v4 = q0 column
    ldrb        w2,[x19]                // -3 value
    dup         v7.2s, v2.s[1]          // v7 = q2 column
    ldrb        w10,[x19,#1]            // -2 value
    dup         v3.2s, v2.s[0]          // v3 = p1 column
    ldrb        w11,[x19,#2]            // -1 value
    dup         v5.2s, v1.s[1]          // v5 = q1 column
    ldrb        w12,[x14,#0]            // 0 value
    dup         v6.2s, v1.s[0]          // v6 = p2 column
    ldrb        w3,[x14,#1]             // 1 value
    dup         v2.2s, v0.s[0]          // v2 = p0 column
    ldrb        w4,[x14,#2]             // 2 value


    add         x12,x12,x4
    subs        x12,x12,x3,lsl #1       // dq3 value is stored in x12
    csneg       x12,x12,x12,pl
    // dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )//


    add         x2,x2,x11
    subs        x11,x2,x10,lsl #1
    csneg       x11,x11,x11,pl          // dp3 value is stored in x11
    // dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )//



    add         x3,x8,x9                // x3 has the d0 value
    add         x4,x11,x12              // x4 has the d3 value


    // d0 = dp0 + dq0//
    // d3 = dp3 + dq3//

    add         x14,x8,x11              // x14 has the value dp
    add         x12,x12,x9              // x12 has the value dq
    // dp = dp0 + dp3//
    // dq = dq0 + dq3//

    add         x11, x3, x4             // x11 has the value d

    // d = d0 + d3//


    cmp         x11,x5
    dup         v22.2s, v0.s[1]         // v22 = q3 column
    bge         l1.964                  // d >= beta : leave the edge untouched

    // if(d < beta)


    // registers which cannot be altered : x3,x4,x5,x6,x12,x14,x0,x1,x11

    // registers for use: x2,x7,x8,x9,x10,

    // strong-filter decision for row 0, interleaved with the strong-filter
    // arithmetic for the q side
    uqsub       v30.8b,v7.8b,v19.8b
    asr         x10,x5,#2
    uqadd       v31.8b,v7.8b,v19.8b
    cmp         x10,x3,lsl #1
    uaddl       v0.8h,v5.8b,v4.8b
    ble         l1.336

    sub         x19,x0,#4
    ldrb        w2,[x19]                // -4 value
    uaddw       v0.8h, v0.8h , v2.8b
    ldrb        w7,[x19,#3]             // -1 value
    umull       v20.8h, v7.8b, v23.8b
    ldrb        w3,[x0,#0]              // 0 value
    umlal       v20.8h, v22.8b, v18.8b
    ldrb        w8,[x0,#3]              // 3 value
//  ubfx        x7,x2,#24,#8            // has the -1 value
//  and         x2,#0xff                // has the -4 value
//  ubfx        x8,x3,#24,#8            // has the 3 value
//  and         x3,#0xff                // x4 has the 0 value



    add         v20.8h, v20.8h , v0.8h
    subs        x8,x8,x3
    rshrn       v22.8b,v20.8h,#3
    csneg       x8,x8,x8,pl
    subs        x2,x2,x7
    umin        v21.8b, v22.8b , v31.8b
    csneg       x2,x2,x2,pl
    umax        v22.8b, v21.8b , v30.8b // v22 = filtered q2 (strong)
    add         x8,x8,x2
    uaddl       v20.8h,v7.8b,v3.8b
    cmp         x8,x5,asr #3
    mla         v20.8h, v0.8h, v16.8h
    bge         l1.336
    uaddw       v0.8h, v0.8h , v7.8b

    subs        x7,x3,x7
    rshrn       v20.8b,v20.8h,#3
    csneg       x7,x7,x7,pl
    rshrn       v0.8b,v0.8h,#2
    mov         x10,#5
    uqadd       v30.8b,v5.8b,v19.8b
    mul         x10, x10, x6
    uqsub       v31.8b,v5.8b,v19.8b
    add         x10, x10,#1
    cmp         x7,x10,asr #1
    bge         l1.336


    // if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
    // && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )

    // same three tests for row 3, interleaved with the strong-filter
    // arithmetic for the p side
    asr         x10,x5,#2
    uqsub       v25.8b,v4.8b,v19.8b
    cmp         x10,x4,lsl #1
    uqadd       v21.8b,v4.8b,v19.8b
    ble         l1.336
    umin        v26.8b, v20.8b , v21.8b
    add         x4,x1,x1,lsl #1
    add         x4,x4,x0                // x4 = address of row 3
    umax        v20.8b, v26.8b , v25.8b // v20 = filtered q0 (strong)
    sub         x19,x4,#4
    ldrb        w2,[x19]                // -4 value
    umin        v19.8b, v0.8b , v30.8b
    ldrb        w7,[x19,#3]             // -1 value
    umax        v21.8b, v19.8b , v31.8b // v21 = filtered q1 (strong)
    ldrb        w3,[x4,#0]              // 0 value
    lsl         x10,x6,#1
    ldrb        w8,[x4,#3]              // 3 value
//  ubfx        x7,x2,#24,#8            // has the -1 value
//  and         x2,#0xff                // has the -4 value
//  ubfx        x8,x3,#24,#8            // has the 3 value
//  and         x3,#0xff                // x4 has the 0 value
    uaddl       v0.8h,v2.8b,v3.8b

    dup         v19.8b,w10              // v19 = 2 * tc again (it was used as scratch)
    subs        x8,x8,x3
    uaddw       v0.8h, v0.8h , v4.8b
    csneg       x8,x8,x8,pl
    uqadd       v30.8b,v2.8b,v19.8b
    subs        x2,x2,x7
    uqsub       v31.8b,v2.8b,v19.8b
    csneg       x2,x2,x2,pl
    uaddl       v26.8h,v5.8b,v6.8b
    add         x8,x8,x2
    mla         v26.8h, v0.8h, v16.8h
    cmp         x8,x5,asr #3
    bge         l1.336
    rshrn       v26.8b,v26.8h,#3
    subs        x7,x3,x7
    uqadd       v27.8b,v3.8b,v19.8b
    csneg       x7,x7,x7,pl
    uqsub       v28.8b,v3.8b,v19.8b
    mov         x10,#5
    umin        v16.8b, v26.8b , v30.8b
    mul         x10, x10, x6
    add         x10, x10,#1
    cmp         x7,x10,asr #1
    umax        v26.8b, v16.8b , v31.8b // v26 = filtered p0 (strong)
    bge         l1.336
    uqadd       v30.8b,v6.8b,v19.8b

    mov         x2,#2                   // de = 2 : strong filter
    mov         x4,x21
    uqsub       v31.8b,v6.8b,v19.8b
    mov         x5,x22
    b           end_dep_deq_decision
    // x2 has the value of de
    // x6 has the value of tc
    // x5 has the value of beta
    // x14 has the value of dp
    // x12 has the value of dq
    // x0 has the value of source address
    // x1 has the src stride

l1.336:
    mov         x2,#1                   // de = 1 : normal filter
l1.424:
    mov         x11,x5                  // x11 = beta (x5 is reused for flag q)
    mov         x4,x21
    mov         x5,x22

    cmp         x6,#1
    mov         x20,#0
    csel        x9, x20, x9,eq
    mov         x20,#0
    csel        x10, x20, x10,eq
    beq         end_dep_deq_decision    // tc == 1 : dep = deq = 0

    and         x7,x4,x5

    cmp         x7,#1
    beq         both_flags_set
    cmp         x4,#0
    beq         set_flag_dep_zero

    // only the p side is decided here; deq is forced to 0
    add         x8,x11,x11,asr #1
    mov         x10,#0
    asr         x8,x8,#3                // (beta + (beta >> 1)) >> 3
    cmp         x8,x14
    mov         x20,#1
    csel        x9, x20, x9,gt
    mov         x20,#0
    csel        x9, x20, x9,le
    b           end_dep_deq_decision
set_flag_dep_zero:
    // flag p is 0: dep is forced to 0, deq is decided from dq
    add         x8,x11,x11,asr #1
    mov         x9,#0
    asr         x8,x8,#3
    cmp         x8,x12
    mov         x20,#1
    csel        x10, x20, x10,gt
    mov         x20,#0
    csel        x10, x20, x10,le
    b           end_dep_deq_decision

both_flags_set:
    add         x8,x11,x11,asr #1
    asr         x8,x8,#3
    cmp         x8,x14
    mov         x20,#1
    csel        x9, x20, x9,gt
    mov         x20,#0
    csel        x9, x20, x9,le
    cmp         x8,x12
    mov         x20,#1
    csel        x10, x20, x10,gt
    mov         x20,#0
    csel        x10, x20, x10,le
end_dep_deq_decision:

    // x0  = source address
    // x1  = stride
    // x2  = de
    // x4  = flag p
    // x5  = flag q
    // x6  = tc
    // x9  = dep
    // x10 = deq
//  b           l1.964


    cmp         x2,#2
    // x2 has the value of de
    bne         l1.968

    // ---- strong filter: write back ----
    cmp         x5,#0
    beq         l1.780
    // x5 has the flag of q

    // q2 goes out as single bytes at pu1_src + 2, (q0, q1) as byte pairs at pu1_src
    add         x3,x0,#2
    st1         {v22.b}[0],[x3],x1

    st1         {v22.b}[1],[x3],x1

    st1         {v22.b}[2],[x3],x1

    st1         {v22.b}[3],[x3]
    add         x3,x0,x1
    mov         v29.8b,v20.8b
    trn1        v20.8b,v29.8b,v21.8b
    trn2        v21.8b,v29.8b,v21.8b

    st1         {v20.h}[0],[x0]
    st1         {v21.h}[0],[x3],x1
    st1         {v20.h}[1],[x3],x1
    st1         {v21.h}[1],[x3]


l1.780:
    cmp         x4,#0
    beq         l1.964
    // x4 has the flag p

    // p0 goes out as single bytes at pu1_src - 1, (p2, p1) as byte pairs at pu1_src - 3
    dup         v7.2s, v24.s[0]         // v7 = p3 column
    sub         x3,x0,#1
    uaddw       v16.8h, v0.8h , v6.8b
    add         x7,x3,x1
    rshrn       v2.8b,v16.8h,#2
    st1         {v26.b}[0],[x3]
    sub         x0,x0,#3
    umin        v16.8b, v2.8b , v27.8b
    st1         {v26.b}[1],[x7],x1
    umull       v2.8h, v6.8b, v23.8b
    umlal       v2.8h, v7.8b, v18.8b
    st1         {v26.b}[2],[x7],x1
    umax        v5.8b, v16.8b , v28.8b  // v5 = filtered p1 (strong)
    st1         {v26.b}[3],[x7]
    add         v0.8h, v2.8h , v0.8h
    rshrn       v0.8b,v0.8h,#3


    umin        v1.8b, v0.8b , v30.8b
    umax        v0.8b, v1.8b , v31.8b   // v0 = filtered p2 (strong)

    mov         v29.8b,v0.8b
    trn1        v0.8b,v29.8b,v5.8b
    trn2        v5.8b,v29.8b,v5.8b

    st1         {v0.h}[0],[x0],x1
    st1         {v5.h}[0],[x0],x1
    st1         {v0.h}[1],[x0],x1
    st1         {v5.h}[1],[x0]
l1.964:
    // common exit: restore callee-saved registers in reverse push order
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
    ret

l1.968:
    // ---- normal filter ----
    // the flags set by "cmp x4,#0" below are consumed by "beq l1.1272";
    // nothing in between writes the condition flags

    movi        v0.8h, #0x9
    neg         x11, x6
    cmp         x4,#0
    // checks for the flag p
    movi        v16.8h, #0x3
    movi        v24.8b, #0x1
    dup         v30.8b,w11              // v30 = -tc
    and         x11,x6,#0xff
    dup         v31.8b,w11              // v31 = tc

    usubl       v18.8h,v4.8b,v2.8b
    mul         v18.8h, v18.8h, v0.8h
    usubl       v0.8h,v5.8b,v3.8b



    mul         v16.8h, v0.8h, v16.8h
    sub         v16.8h, v18.8h , v16.8h
    srshr       v16.8h,v16.8h,#4
    // delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4//

    abs         v0.8h, v16.8h
    xtn         v0.8b, v0.8h
    // storing the absolute values of delta in v0

    sqxtn       v16.8b,v16.8h
    // storing the clipped values of delta in v16

    movi        v1.8b, #0xa
    dup         v21.8b,w11
    mul         v1.8b, v1.8b, v21.8b
    // v1 stores the value (10 * tc)

    // if(abs(delta) < 10 * tc)

    smin        v18.8b, v16.8b , v31.8b
    smax        v20.8b, v18.8b , v30.8b

    // delta = clip3(delta, -tc, tc)//
    sxtl        v16.8h, v20.8b
    uxtl        v18.8h, v2.8b
    add         v18.8h, v18.8h , v16.8h

    sqxtun      v22.8b, v18.8h
    uxtl        v18.8h, v4.8b
    sub         v16.8h, v18.8h , v16.8h
    sqxtun      v23.8b, v16.8h
    // tmp_p0 = clip_u8(pu1_src[-1] + delta)//
    // tmp_q0 = clip_u8(pu1_src[0] - delta)//
    beq         l1.1272                 // flag p == 0 : skip the p side



    cmp         x9,#1
    bne         l1.1212
    // checks for the flag dep

    // p1 update: p1 += clip3(((((p2 + p0 + 1) >> 1) - p1 + delta) >> 1), -(tc >> 1), tc >> 1)
    asr         x3,x6,#1


    uaddl       v16.8h,v6.8b,v2.8b
    uaddw       v16.8h, v16.8h , v24.8b
    dup         v18.8b,w3
    sub         x20,x3,#0
    neg         x3, x20
    dup         v19.8b,w3
    ushr        v16.8h,v16.8h,#1
    xtn         v16.8b, v16.8h

    usubl       v16.8h,v16.8b,v3.8b
    saddw       v16.8h, v16.8h , v20.8b
    sshr        v16.8h,v16.8h,#1
    sqxtn       v16.8b,v16.8h

    smin        v17.8b, v16.8b , v18.8b
    smax        v16.8b, v19.8b , v17.8b




    uxtl        v18.8h, v3.8b
    sxtl        v16.8h, v16.8b
    add         v16.8h, v18.8h , v16.8h

    sqxtun      v16.8b, v16.8h
    mov         v30.8b,v3.8b
    cmhs        v3.8b,v0.8b,v1.8b       // mask: abs(delta) >= 10 * tc
    bsl         v3.8b,v30.8b,v16.8b     // keep the original p1 where the mask is set


l1.1212:
    dup         v16.8b,w11
    sub         x12,x0,#3
    sub         x3,x0,#1
//  smul        v16.8b, v16.8b, v1.8b
    mov         v29.8b,v6.8b
    trn1        v6.8b,v29.8b,v3.8b
    trn2        v3.8b,v29.8b,v3.8b
    st1         {v6.h}[0],[x12],x1
    cmhs        v16.8b,v0.8b,v1.8b      // mask: abs(delta) >= 10 * tc
    st1         {v3.h}[0],[x12],x1
    bsl         v16.8b,v2.8b,v22.8b     // original p0 where the mask is set, else tmp_p0
    st1         {v16.b}[0],[x3],x1
    st1         {v16.b}[1],[x3],x1
    st1         {v6.h}[1],[x12],x1
    st1         {v16.b}[2],[x3],x1
    st1         {v3.h}[1],[x12]
    st1         {v16.b}[3],[x3]

l1.1272:
    cmp         x5,#0
    beq         l1.964
    // checks for the flag q
    cmp         x10,#1
    bne         l1.1412
    // checks for the flag deq

    // q1 update: q1 += clip3(((((q2 + q0 + 1) >> 1) - q1 - delta) >> 1), -(tc >> 1), tc >> 1)
    mov         v2.8b,v7.8b
    asr         x3,x6,#1

    dup         v6.8b,w3
    sub         x20,x3,#0
    neg         x3, x20
    dup         v16.8b,w3
    uaddl       v2.8h,v2.8b,v4.8b
    uaddw       v2.8h, v2.8h , v24.8b
    ushr        v2.8h,v2.8h,#1
    xtn         v2.8b, v2.8h

    usubl       v2.8h,v2.8b,v5.8b
    ssubw       v2.8h, v2.8h , v20.8b
    sshr        v2.8h,v2.8h,#1
    sqxtn       v3.8b,v2.8h

    smin        v2.8b, v3.8b , v6.8b
    smax        v3.8b, v16.8b , v2.8b
//  dup         v6.8b,w2
//  smul        v6.8b, v6.8b, v1.8b



    uxtl        v16.8h, v5.8b
    sxtl        v2.8h, v3.8b
    add         v2.8h, v16.8h , v2.8h
    sqxtun      v3.8b, v2.8h
    mov         v30.8b,v5.8b
    cmhs        v5.8b,v0.8b,v1.8b       // mask: abs(delta) >= 10 * tc
    bsl         v5.8b,v30.8b,v3.8b      // keep the original q1 where the mask is set
l1.1412:
//  dup         v2.8b,w2
    add         x3,x0,#2
    add         x11,x3,x1
//  smul        v1.8b, v2.8b, v1.8b
    st1         {v7.b}[0],[x3]          // q2 is written back unchanged
    st1         {v7.b}[1],[x11],x1
    st1         {v7.b}[2],[x11],x1
    cmhs        v0.8b,v0.8b,v1.8b       // mask: abs(delta) >= 10 * tc
    st1         {v7.b}[3],[x11]
    bsl         v0.8b,v4.8b,v23.8b      // original q0 where the mask is set, else tmp_q0
    mov         v29.8b,v0.8b
    trn1        v0.8b,v29.8b,v5.8b
    trn2        v5.8b,v29.8b,v5.8b

    st1         {v0.h}[0],[x0],x1
    st1         {v5.h}[0],[x0],x1
    st1         {v0.h}[1],[x0],x1
    st1         {v5.h}[1],[x0]

    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
    ret