//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// *  impeg2_idct.s
// *
// * @brief
// *  contains function definitions for single stage inverse transform
// *
// * @author
// *  anand s
// *
// * @par list of functions:
// *  - impeg2_idct_recon_dc_av8()
// *
// * @remarks
// *  none
// *
// *******************************************************************************
//*/

///**
// *******************************************************************************
// *
// * @brief
// *  this function performs inverse transform and reconstruction for 8x8
// *  input block
// *
// * @par description:
// *  performs inverse transform and adds the prediction data and clips output
// *  to 8 bit
// *
// * @param[in] pi2_src
// *  input 8x8 coefficients
// *
// * @param[in] pi2_tmp
// *  temporary 8x8 buffer for storing inverse
// *  transform 1st stage output
// *
// * @param[in] pu1_pred
// *  prediction 8x8 block
// *
// * @param[out] pu1_dst
// *  output 8x8 block
// *
// * @param[in] src_strd
// *  input stride
// *
// * @param[in] pred_strd
// *  prediction stride
// *
// * @param[in] dst_strd
// *  output stride
// *
// * @param[in] shift
// *  output shift
// *
// * @param[in] zero_cols
// *  zero columns in pi2_src
// *
// * @returns  void
// *
// * @remarks
// *  none
// *
// *******************************************************************************
// */

//void impeg2_itrans_recon_8x8(word16 *pi2_src,
//                             word16 *pi2_tmp,
//                             uword8 *pu1_pred,
//                             uword8 *pu1_dst,
//                             word32 src_strd,
//                             word32 pred_strd,
//                             word32 dst_strd,
//                             word32 zero_cols,
//                             word32 zero_rows)

//**************variables vs registers*************************
//    x0 => *pi2_src
//    x1 => *pi2_tmp
//    x2 => *pu1_pred
//    x3 => *pu1_dst
//    src_strd
//    pred_strd
//    dst_strd
//    zero_cols

.text
.align 4
.include "impeg2_neon_macros.s"

.set idct_stg1_shift, 12
.set idct_stg2_shift, 16
.set idct_stg1_round, (1 << (idct_stg1_shift - 1))
.set idct_stg2_round, (1 << (idct_stg2_shift - 1))

.extern gai2_impeg2_idct_q15
.extern gai2_impeg2_idct_q11
.extern gai2_impeg2_idct_first_col_q15
.extern gai2_impeg2_idct_first_col_q11
.extern gai2_impeg2_mismatch_stg2_additive
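///**
// *******************************************************************************
// * The NEON code below handles the DC-only case: the single coefficient is
// * scaled through both IDCT stages and the resulting constant is added to the
// * 8x8 prediction block with unsigned saturation. A rough scalar C sketch of
// * the same computation, for reference only (CLIP_U8 is a hypothetical helper
// * that clamps to [0, 255]; it is not defined in this file):
// *
// *   dc = (pi2_src[0] * gai2_impeg2_idct_q15[0] + idct_stg1_round) >> idct_stg1_shift;
// *   dc = (dc * gai2_impeg2_idct_q11[0] + idct_stg2_round) >> idct_stg2_shift;
// *   for(i = 0; i < 8; i++)
// *       for(j = 0; j < 8; j++)
// *           pu1_dst[i * dst_strd + j] = CLIP_U8(pu1_pred[i * pred_strd + j] + dc);
// *******************************************************************************
// */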
.global impeg2_idct_recon_dc_av8
impeg2_idct_recon_dc_av8:
//  STMFD sp!,{x4,x6,x12,x14}
    push_v_regs
////x0: pi2_src
////x1: pi2_tmp - not used, used as pred_strd
////x2: pu1_pred
////x3: pu1_dst
////x4: used as scratch
////x5: pred_strd
////x6: dst_strd

    ldrsh       x4, [x0]
    adrp        x14, :got:gai2_impeg2_idct_q15
    ldr         x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
    ldrsh       x12, [x14]
    ld1         {v0.8b}, [x2], x5
    mul         x4, x4, x12
    ld1         {v1.8b}, [x2], x5
    add         x4, x4, #idct_stg1_round
    ld1         {v2.8b}, [x2], x5
    asr         x4, x4, #idct_stg1_shift
    adrp        x14, :got:gai2_impeg2_idct_q11
    ldr         x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
    ldrsh       x12, [x14]
    ld1         {v3.8b}, [x2], x5
    mul         x4, x4, x12
    ld1         {v4.8b}, [x2], x5
    add         x4, x4, #idct_stg2_round
    ld1         {v5.8b}, [x2], x5
    asr         x4, x4, #idct_stg2_shift
    ld1         {v6.8b}, [x2], x5
    dup         v30.8h, w4
    ld1         {v7.8b}, [x2], x5

    uaddw       v8.8h, v30.8h, v0.8b
    uaddw       v10.8h, v30.8h, v1.8b
    sqxtun      v0.8b, v8.8h
    uaddw       v12.8h, v30.8h, v2.8b
    sqxtun      v1.8b, v10.8h
    st1         {v0.8b}, [x3], x6
    uaddw       v14.8h, v30.8h, v3.8b
    sqxtun      v2.8b, v12.8h
    st1         {v1.8b}, [x3], x6
    uaddw       v16.8h, v30.8h, v4.8b
    sqxtun      v3.8b, v14.8h
    st1         {v2.8b}, [x3], x6
    uaddw       v18.8h, v30.8h, v5.8b
    sqxtun      v4.8b, v16.8h
    st1         {v3.8b}, [x3], x6
    uaddw       v20.8h, v30.8h, v6.8b
    sqxtun      v5.8b, v18.8h
    st1         {v4.8b}, [x3], x6
    uaddw       v22.8h, v30.8h, v7.8b
    sqxtun      v6.8b, v20.8h
    st1         {v5.8b}, [x3], x6
    sqxtun      v7.8b, v22.8h
    st1         {v6.8b}, [x3], x6
    st1         {v7.8b}, [x3], x6

//  LDMFD sp!,{x4,x6,x12,pc}
    pop_v_regs
    ret
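///**
// *******************************************************************************
// * The variant below additionally applies the mismatch-control correction:
// * instead of a plain stage-2 round and shift, the 32-bit DC value is combined
// * with a per-pixel additive from gai2_impeg2_mismatch_stg2_additive and the
// * sum is rounded and narrowed by 16 bits (raddhn/raddhn2). A rough scalar C
// * sketch, for reference only (CLIP_U8 is the same hypothetical clamp helper
// * as in the sketch above):
// *
// *   dc = (pi2_src[0] * gai2_impeg2_idct_q15[0] + idct_stg1_round) >> idct_stg1_shift;
// *   dc = dc * gai2_impeg2_idct_q11[0];
// *   for(i = 0; i < 8; i++)
// *       for(j = 0; j < 8; j++)
// *       {
// *           val = (dc + gai2_impeg2_mismatch_stg2_additive[i * 8 + j] + (1 << 15)) >> 16;
// *           pu1_dst[i * dst_strd + j] = CLIP_U8(pu1_pred[i * pred_strd + j] + val);
// *       }
// *******************************************************************************
// */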
.global impeg2_idct_recon_dc_mismatch_av8
.extern gai2_impeg2_idct_last_row_q11
.extern gai2_impeg2_mismatch_stg1_outp

impeg2_idct_recon_dc_mismatch_av8:
//  STMFD sp!,{x4-x12,x14}
    push_v_regs

    ldrsh       x4, [x0]
    adrp        x14, :got:gai2_impeg2_idct_q15
    ldr         x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
    ldrsh       x12, [x14]
    mul         x4, x4, x12
    add         x4, x4, #idct_stg1_round
    asr         x4, x4, #idct_stg1_shift

    adrp        x14, :got:gai2_impeg2_idct_q11
    ldr         x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
    ldrsh       x12, [x14]
    mul         x4, x4, x12
    dup         v0.4s, w4

    mov         x14, #16                    ////Increment for table read
    adrp        x4, :got:gai2_impeg2_mismatch_stg2_additive
    ldr         x4, [x4, #:got_lo12:gai2_impeg2_mismatch_stg2_additive]

    ld1         {v2.4h, v3.4h}, [x4], x14
    ld1         {v30.8b}, [x2], x5
    sxtl        v8.4s, v2.4h
    sxtl        v10.4s, v3.4h
    raddhn      v12.4h, v0.4s, v8.4s
    raddhn2     v12.8h, v0.4s, v10.4s
    uaddw       v14.8h, v12.8h, v30.8b
    sqxtun      v30.8b, v14.8h
    st1         {v30.8b}, [x3], x6

    ld1         {v2.4h, v3.4h}, [x4], x14
    ld1         {v30.8b}, [x2], x5
    sxtl        v8.4s, v2.4h
    sxtl        v10.4s, v3.4h
    raddhn      v12.4h, v0.4s, v8.4s
    raddhn2     v12.8h, v0.4s, v10.4s
    uaddw       v14.8h, v12.8h, v30.8b
    sqxtun      v30.8b, v14.8h
    st1         {v30.8b}, [x3], x6

    ld1         {v2.4h, v3.4h}, [x4], x14
    ld1         {v30.8b}, [x2], x5
    sxtl        v8.4s, v2.4h
    sxtl        v10.4s, v3.4h
    raddhn      v12.4h, v0.4s, v8.4s
    raddhn2     v12.8h, v0.4s, v10.4s
    uaddw       v14.8h, v12.8h, v30.8b
    sqxtun      v30.8b, v14.8h
    st1         {v30.8b}, [x3], x6

    ld1         {v2.4h, v3.4h}, [x4], x14
    ld1         {v30.8b}, [x2], x5
    sxtl        v8.4s, v2.4h
    sxtl        v10.4s, v3.4h
    raddhn      v12.4h, v0.4s, v8.4s
    raddhn2     v12.8h, v0.4s, v10.4s
    uaddw       v14.8h, v12.8h, v30.8b
    sqxtun      v30.8b, v14.8h
    st1         {v30.8b}, [x3], x6

    ld1         {v2.4h, v3.4h}, [x4], x14
    ld1         {v30.8b}, [x2], x5
    sxtl        v8.4s, v2.4h
    sxtl        v10.4s, v3.4h
    raddhn      v12.4h, v0.4s, v8.4s
    raddhn2     v12.8h, v0.4s, v10.4s
    uaddw       v14.8h, v12.8h, v30.8b
    sqxtun      v30.8b, v14.8h
    st1         {v30.8b}, [x3], x6

    ld1         {v2.4h, v3.4h}, [x4], x14
    ld1         {v30.8b}, [x2], x5
    sxtl        v8.4s, v2.4h
    sxtl        v10.4s, v3.4h
    raddhn      v12.4h, v0.4s, v8.4s
    raddhn2     v12.8h, v0.4s, v10.4s
    uaddw       v14.8h, v12.8h, v30.8b
    sqxtun      v30.8b, v14.8h
    st1         {v30.8b}, [x3], x6

    ld1         {v2.4h, v3.4h}, [x4], x14
    ld1         {v30.8b}, [x2], x5
    sxtl        v8.4s, v2.4h
    sxtl        v10.4s, v3.4h
    raddhn      v12.4h, v0.4s, v8.4s
    raddhn2     v12.8h, v0.4s, v10.4s
    uaddw       v14.8h, v12.8h, v30.8b
    sqxtun      v30.8b, v14.8h
    st1         {v30.8b}, [x3], x6

    ld1         {v2.4h, v3.4h}, [x4], x14
    ld1         {v30.8b}, [x2], x5
    sxtl        v8.4s, v2.4h
    sxtl        v10.4s, v3.4h
    raddhn      v12.4h, v0.4s, v8.4s
    raddhn2     v12.8h, v0.4s, v10.4s
    uaddw       v14.8h, v12.8h, v30.8b
    sqxtun      v30.8b, v14.8h
    st1         {v30.8b}, [x3], x6

//  LDMFD sp!,{x4-x12,pc}
    pop_v_regs
    ret

.globl impeg2_idct_recon_av8
.type impeg2_idct_recon_av8, %function

impeg2_idct_recon_av8:

////register usage.extern        - loading and until idct of columns
//// cosine constants    -   d0
//// sine constants      -   d1
//// row 0 first half    -   d2      -   y0
//// row 1 first half    -   d6      -   y1
//// row 2 first half    -   d3      -   y2
//// row 3 first half    -   d7      -   y3
//// row 4 first half    -   d10     -   y4
//// row 5 first half    -   d14     -   y5
//// row 6 first half    -   d11     -   y6
//// row 7 first half    -   d15     -   y7

//// row 0 second half   -   d4      -   y0
//// row 1 second half   -   d8      -   y1
//// row 2 second half   -   d5      -   y2
//// row 3 second half   -   d9      -   y3
//// row 4 second half   -   d12     -   y4
//// row 5 second half   -   d16     -   y5
//// row 6 second half   -   d13     -   y6
//// row 7 second half   -   d17     -   y7

//// copy the input pointer to another register
//// step 1 : load all constants
//  stmfd sp!,{x4-x12,x14}

    ldr         w11, [sp]                   // zero rows

    push_v_regs
    stp         x19, x20, [sp, #-16]!

    mov         x12, x7                     // zero columns
    mov         x8, x5                      // prediction stride
    mov         x7, x6                      // destination stride
    mov         x6, x4                      // src stride
    lsl         x6, x6, #1                  // x sizeof(word16)
    add         x9, x0, x6, lsl #1          // 2 rows

    add         x10, x6, x6, lsl #1         // 3 rows
    sub         x10, x10, #8                // - 4 cols * sizeof(word16)
    sub         x5, x6, #8                  // src_strd - 4 cols * sizeof(word16)

    adrp        x14, :got:gai2_impeg2_idct_first_col_q15
    ldr         x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    ld1         {v0.4h, v1.4h}, [x14]       ////d0,d1 are used for storing the constant data

////step 2 load all the input data
////step 3 operate first 4 columns at a time

    and         x11, x11, #0xff
    and         x12, x12, #0xff

    cmp         x11, #0xf0
    bge         skip_last4_rows
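///**
// * The even/odd decomposition used by both passes, written out as a scalar C
// * sketch (illustration only; the names follow the instruction comments below,
// * with cos1..cos4/sin1..sin3 coming from the gai2_impeg2_idct_first_col
// * tables):
// *
// *   c0 = y0 * cos4 + y4 * cos4;          c1 = y0 * cos4 - y4 * cos4;
// *   d0 = y2 * cos2 + y6 * sin2;          d1 = y2 * sin2 - y6 * cos2;
// *   a0 = c0 + d0;  a1 = c1 + d1;  a2 = c1 - d1;  a3 = c0 - d0;
// *   b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1;
// *   b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3;
// *   b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3;
// *   b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1;
// *   x0 = (a0 + b0 + rnd) >> shift;   x7 = (a0 - b0 + rnd) >> shift;
// *   x1 = (a1 + b1 + rnd) >> shift;   x6 = (a1 - b1 + rnd) >> shift;
// *   x2 = (a2 + b2 + rnd) >> shift;   x5 = (a2 - b2 + rnd) >> shift;
// *   x3 = (a3 + b3 + rnd) >> shift;   x4 = (a3 - b3 + rnd) >> shift;
// *
// * shift is idct_stg1_shift (12) for the column pass and idct_stg2_shift (16)
// * for the row pass; the zero_rows check just above skips the y4..y7 terms
// * when its top four bits flag the last four rows as all zero.
// */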
    ld1         {v2.4h}, [x0], #8
    ld1         {v3.4h}, [x9], #8
    ld1         {v4.4h}, [x0], x5
    smull       v20.4s, v2.4h, v0.4h[0]     //// y0 * cos4(part of c0 and c1)
    ld1         {v5.4h}, [x9], x5
    smull       v18.4s, v3.4h, v1.4h[2]     //// y2 * sin2 (q3 is freed by this time)(part of d1)
    ld1         {v6.4h}, [x0], #8
    ld1         {v7.4h}, [x9], #8
    smull       v24.4s, v6.4h, v0.4h[1]     //// y1 * cos1(part of b0)
    ld1         {v8.4h}, [x0], x10
    smull       v26.4s, v6.4h, v0.4h[3]     //// y1 * cos3(part of b1)
    ld1         {v9.4h}, [x9], x10
    smull       v28.4s, v6.4h, v1.4h[1]     //// y1 * sin3(part of b2)
    ld1         {v10.4h}, [x0], #8
    smull       v30.4s, v6.4h, v1.4h[3]     //// y1 * sin1(part of b3)
    ld1         {v11.4h}, [x9], #8
    smlal       v24.4s, v7.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    ld1         {v12.4h}, [x0], x5
    smlsl       v26.4s, v7.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    ld1         {v13.4h}, [x9], x5
    smlsl       v28.4s, v7.4h, v0.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    ld1         {v14.4h}, [x0], #8
    smlsl       v30.4s, v7.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    ld1         {v15.4h}, [x9], #8
    smull       v22.4s, v10.4h, v0.4h[0]    //// y4 * cos4(part of c0 and c1)
    ld1         {v16.4h}, [x0], x10
    smull       v6.4s, v3.4h, v0.4h[2]      //// y2 * cos2(part of d0)
    ld1         {v17.4h}, [x9], x10

///* this following was activated when alignment is not there */
//// vld1.16 d2,[x0]!
//// vld1.16 d3,[x2]!
//// vld1.16 d4,[x0]!
//// vld1.16 d5,[x2]!
//// vld1.16 d6,[x0]!
//// vld1.16 d7,[x2]!
//// vld1.16 d8,[x0],x3
//// vld1.16 d9,[x2],x3
//// vld1.16 d10,[x0]!
//// vld1.16 d11,[x2]!
//// vld1.16 d12,[x0]!
//// vld1.16 d13,[x2]!
//// vld1.16 d14,[x0]!
//// vld1.16 d15,[x2]!
//// vld1.16 d16,[x0],x3
//// vld1.16 d17,[x2],x3

    smlal       v24.4s, v14.4h, v1.4h[1]    //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl       v26.4s, v14.4h, v0.4h[1]    //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal       v28.4s, v14.4h, v1.4h[3]    //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal       v30.4s, v14.4h, v0.4h[3]    //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl       v18.4s, v11.4h, v0.4h[2]    //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal       v6.4s, v11.4h, v1.4h[2]     //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add         v10.4s, v20.4s, v22.4s      //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub         v20.4s, v20.4s, v22.4s      //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal       v24.4s, v15.4h, v1.4h[3]    //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    smlsl       v26.4s, v15.4h, v1.4h[1]    //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    smlal       v28.4s, v15.4h, v0.4h[3]    //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl       v30.4s, v15.4h, v0.4h[1]    //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    add         v14.4s, v10.4s, v6.4s       //// a0 = c0 + d0(part of x0,x7)
    sub         v10.4s, v10.4s, v6.4s       //// a3 = c0 - d0(part of x3,x4)
    sub         v22.4s, v20.4s, v18.4s      //// a2 = c1 - d1(part of x2,x5)
    add         v18.4s, v20.4s, v18.4s      //// a1 = c1 + d1(part of x1,x6)

    add         v20.4s, v14.4s, v24.4s      //// a0 + b0(part of x0)
    sub         v6.4s, v14.4s, v24.4s       //// a0 - b0(part of x7)
    add         v24.4s, v22.4s, v28.4s      //// a2 + b2(part of x2)
    sub         v22.4s, v22.4s, v28.4s      //// a2 - b2(part of x5)
    add         v28.4s, v18.4s, v26.4s      //// a1 + b1(part of x1)
    sub         v18.4s, v18.4s, v26.4s      //// a1 - b1(part of x6)
    add         v26.4s, v10.4s, v30.4s      //// a3 + b3(part of x3)
    sub         v30.4s, v10.4s, v30.4s      //// a3 - b3(part of x4)

    sqrshrn     v2.4h, v20.4s, #idct_stg1_shift     //// x0 = (a0 + b0 + rnd) >> idct_stg1_shift
    sqrshrn     v15.4h, v6.4s, #idct_stg1_shift     //// x7 = (a0 - b0 + rnd) >> idct_stg1_shift
    sqrshrn     v3.4h, v24.4s, #idct_stg1_shift     //// x2 = (a2 + b2 + rnd) >> idct_stg1_shift
    sqrshrn     v14.4h, v22.4s, #idct_stg1_shift    //// x5 = (a2 - b2 + rnd) >> idct_stg1_shift
    sqrshrn     v6.4h, v28.4s, #idct_stg1_shift     //// x1 = (a1 + b1 + rnd) >> idct_stg1_shift
    sqrshrn     v11.4h, v18.4s, #idct_stg1_shift    //// x6 = (a1 - b1 + rnd) >> idct_stg1_shift
    sqrshrn     v7.4h, v26.4s, #idct_stg1_shift     //// x3 = (a3 + b3 + rnd) >> idct_stg1_shift
    sqrshrn     v10.4h, v30.4s, #idct_stg1_shift    //// x4 = (a3 - b3 + rnd) >> idct_stg1_shift

    b           last4_cols
skip_last4_rows:
    adrp        x14, :got:gai2_impeg2_idct_first_col_q15
    ldr         x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    ld1         {v0.4h, v1.4h}, [x14]

    ld1         {v2.4h}, [x0], #8
    ld1         {v3.4h}, [x9], #8
    ld1         {v4.4h}, [x0], x5
    ld1         {v5.4h}, [x9], x5
    ld1         {v6.4h}, [x0], #8
    ld1         {v7.4h}, [x9], #8
    ld1         {v8.4h}, [x0], x10
    ld1         {v9.4h}, [x9], x10

    movi        v12.4h, #0
    movi        v13.4h, #0
    movi        v16.4h, #0
    movi        v17.4h, #0

    smull       v24.4s, v6.4h, v0.4h[1]     //// y1 * cos1(part of b0)
    smull       v26.4s, v6.4h, v0.4h[3]     //// y1 * cos3(part of b1)
    smull       v28.4s, v6.4h, v1.4h[1]     //// y1 * sin3(part of b2)
    smull       v30.4s, v6.4h, v1.4h[3]     //// y1 * sin1(part of b3)

    smlal       v24.4s, v7.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v7.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v7.4h, v0.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v7.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)

    smull       v18.4s, v3.4h, v1.4h[2]     //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull       v6.4s, v3.4h, v0.4h[2]      //// y2 * cos2(part of d0)
    smull       v20.4s, v2.4h, v0.4h[0]     //// y0 * cos4(part of c0 and c1)

    add         v14.4s, v20.4s, v6.4s       //// a0 = c0 + d0(part of x0,x7)
    sub         v10.4s, v20.4s, v6.4s       //// a3 = c0 - d0(part of x3,x4)
    sub         v22.4s, v20.4s, v18.4s      //// a2 = c1 - d1(part of x2,x5)
    add         v18.4s, v20.4s, v18.4s      //// a1 = c1 + d1(part of x1,x6)

    add         v20.4s, v14.4s, v24.4s      //// a0 + b0(part of x0)
    sub         v6.4s, v14.4s, v24.4s       //// a0 - b0(part of x7)
    add         v24.4s, v22.4s, v28.4s      //// a2 + b2(part of x2)
    sub         v22.4s, v22.4s, v28.4s      //// a2 - b2(part of x5)
    add         v28.4s, v18.4s, v26.4s      //// a1 + b1(part of x1)
    sub         v18.4s, v18.4s, v26.4s      //// a1 - b1(part of x6)
    add         v26.4s, v10.4s, v30.4s      //// a3 + b3(part of x3)
    sub         v30.4s, v10.4s, v30.4s      //// a3 - b3(part of x4)

    sqrshrn     v2.4h, v20.4s, #idct_stg1_shift     //// x0 = (a0 + b0 + rnd) >> idct_stg1_shift
    sqrshrn     v15.4h, v6.4s, #idct_stg1_shift     //// x7 = (a0 - b0 + rnd) >> idct_stg1_shift
    sqrshrn     v3.4h, v24.4s, #idct_stg1_shift     //// x2 = (a2 + b2 + rnd) >> idct_stg1_shift
    sqrshrn     v14.4h, v22.4s, #idct_stg1_shift    //// x5 = (a2 - b2 + rnd) >> idct_stg1_shift
    sqrshrn     v6.4h, v28.4s, #idct_stg1_shift     //// x1 = (a1 + b1 + rnd) >> idct_stg1_shift
    sqrshrn     v11.4h, v18.4s, #idct_stg1_shift    //// x6 = (a1 - b1 + rnd) >> idct_stg1_shift
    sqrshrn     v7.4h, v26.4s, #idct_stg1_shift     //// x3 = (a3 + b3 + rnd) >> idct_stg1_shift
    sqrshrn     v10.4h, v30.4s, #idct_stg1_shift    //// x4 = (a3 - b3 + rnd) >> idct_stg1_shift

last4_cols:
    adrp        x14, :got:gai2_impeg2_idct_first_col_q15
    ldr         x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    ld1         {v0.4h, v1.4h}, [x14]

    cmp         x12, #0xf0
    bge         skip_last4cols

    smull       v24.4s, v8.4h, v0.4h[1]     //// y1 * cos1(part of b0)
    smull       v26.4s, v8.4h, v0.4h[3]     //// y1 * cos3(part of b1)
    smull       v28.4s, v8.4h, v1.4h[1]     //// y1 * sin3(part of b2)
    smull       v30.4s, v8.4h, v1.4h[3]     //// y1 * sin1(part of b3)

    smlal       v24.4s, v9.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v9.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v9.4h, v0.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v9.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)

    smull       v18.4s, v5.4h, v1.4h[2]     //// y2 * sin2 (q4 is freed by this time)(part of d1)
    smull       v8.4s, v5.4h, v0.4h[2]      //// y2 * cos2(part of d0)
    smull       v20.4s, v4.4h, v0.4h[0]     //// y0 * cos4(part of c0 and c1)
    smull       v22.4s, v12.4h, v0.4h[0]    //// y4 * cos4(part of c0 and c1)

    smlal       v24.4s, v16.4h, v1.4h[1]    //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl       v26.4s, v16.4h, v0.4h[1]    //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal       v28.4s, v16.4h, v1.4h[3]    //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal       v30.4s, v16.4h, v0.4h[3]    //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl       v18.4s, v13.4h, v0.4h[2]    //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal       v8.4s, v13.4h, v1.4h[2]     //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add         v12.4s, v20.4s, v22.4s      //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub         v20.4s, v20.4s, v22.4s      //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal       v24.4s, v17.4h, v1.4h[3]    //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
    smlsl       v26.4s, v17.4h, v1.4h[1]    //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
    smlal       v28.4s, v17.4h, v0.4h[3]    //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
    smlsl       v30.4s, v17.4h, v0.4h[1]    //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)

    add         v16.4s, v12.4s, v8.4s       //// a0 = c0 + d0(part of e0,e7)
    sub         v12.4s, v12.4s, v8.4s       //// a3 = c0 - d0(part of e3,e4)
    sub         v22.4s, v20.4s, v18.4s      //// a2 = c1 - d1(part of e2,e5)
    add         v18.4s, v20.4s, v18.4s      //// a1 = c1 + d1(part of e1,e6)

    add         v20.4s, v16.4s, v24.4s      //// a0 + b0(part of e0)
    sub         v8.4s, v16.4s, v24.4s       //// a0 - b0(part of e7)
    add         v24.4s, v22.4s, v28.4s      //// a2 + b2(part of e2)
    sub         v22.4s, v22.4s, v28.4s      //// a2 - b2(part of e5)
    add         v28.4s, v18.4s, v26.4s      //// a1 + b1(part of e1)
    sub         v18.4s, v18.4s, v26.4s      //// a1 - b1(part of e6)
    add         v26.4s, v12.4s, v30.4s      //// a3 + b3(part of e3)
    sub         v30.4s, v12.4s, v30.4s      //// a3 - b3(part of x4)

    sqrshrn     v4.4h, v20.4s, #idct_stg1_shift     //// x0 = (a0 + b0 + rnd) >> idct_stg1_shift
    sqrshrn     v17.4h, v8.4s, #idct_stg1_shift     //// x7 = (a0 - b0 + rnd) >> idct_stg1_shift
    sqrshrn     v5.4h, v24.4s, #idct_stg1_shift     //// x2 = (a2 + b2 + rnd) >> idct_stg1_shift
    sqrshrn     v16.4h, v22.4s, #idct_stg1_shift    //// x5 = (a2 - b2 + rnd) >> idct_stg1_shift
    sqrshrn     v8.4h, v28.4s, #idct_stg1_shift     //// x1 = (a1 + b1 + rnd) >> idct_stg1_shift
    sqrshrn     v13.4h, v18.4s, #idct_stg1_shift    //// x6 = (a1 - b1 + rnd) >> idct_stg1_shift
    sqrshrn     v9.4h, v26.4s, #idct_stg1_shift     //// x3 = (a3 + b3 + rnd) >> idct_stg1_shift
    sqrshrn     v12.4h, v30.4s, #idct_stg1_shift    //// x4 = (a3 - b3 + rnd) >> idct_stg1_shift

    b           end_skip_last4cols
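////When the zero_cols mask flags the right four columns as all zero (the
////cmp/bge on x12 above), the skip_last4cols path below transposes only the
////left half of the first-stage output and runs the row pass on the y0..y3
////terms alone, using the gai2_impeg2_idct_first_col_q11 constants, since the
////remaining columns contribute nothing.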
skip_last4cols:
    adrp        x14, :got:gai2_impeg2_idct_first_col_q11
    ldr         x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11]
    ld1         {v0.4h, v1.4h}, [x14]

    umov        x15, v25.d[0]

    trn1        v25.4h, v2.4h, v6.4h
    trn2        v29.4h, v2.4h, v6.4h        ////[x3,x1],[x2,x0] first quadrant transposing
    trn1        v27.4h, v3.4h, v7.4h
    trn2        v31.4h, v3.4h, v7.4h        ////[x3,x1],[x2,x0] first quadrant transposing

    trn1        v6.2s, v29.2s, v31.2s
    trn2        v7.2s, v29.2s, v31.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....
    trn1        v2.2s, v25.2s, v27.2s
    trn2        v3.2s, v25.2s, v27.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....

    trn1        v25.4h, v10.4h, v14.4h
    trn2        v29.4h, v10.4h, v14.4h      ////[x7,x5],[x6,x4] third quadrant transposing
    trn1        v27.4h, v11.4h, v15.4h
    trn2        v31.4h, v11.4h, v15.4h      ////[x7,x5],[x6,x4] third quadrant transposing

    trn1        v10.2s, v25.2s, v27.2s
    trn2        v11.2s, v25.2s, v27.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....
    trn1        v14.2s, v29.2s, v31.2s
    trn2        v15.2s, v29.2s, v31.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....

    mov         v25.d[0], x15

    smull       v24.4s, v6.4h, v0.4h[1]     //// y1 * cos1(part of b0)
    smull       v26.4s, v6.4h, v0.4h[3]     //// y1 * cos3(part of b1)
    smull       v28.4s, v6.4h, v1.4h[1]     //// y1 * sin3(part of b2)
    smull       v30.4s, v6.4h, v1.4h[3]     //// y1 * sin1(part of b3)

    smlal       v24.4s, v7.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v7.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v7.4h, v0.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v7.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)

    smull       v20.4s, v2.4h, v0.4h[0]     //// y0 * cos4(part of c0 and c1)
//  vmull.s16   q11, d4, d0[0]              @// y4 * cos4(part of c0 and c1)
    smull       v18.4s, v3.4h, v1.4h[2]     //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull       v6.4s, v3.4h, v0.4h[2]      //// y2 * cos2(part of d0)

    sub         v22.4s, v20.4s, v6.4s       //// a3 = c0 - d0(part of x3,x4)
    add         v4.4s, v20.4s, v6.4s        //// a0 = c0 + d0(part of x0,x7)

    add         v2.4s, v4.4s, v24.4s
    sub         v6.4s, v4.4s, v24.4s
    add         v8.4s, v22.4s, v30.4s
    sub         v24.4s, v22.4s, v30.4s

    sqrshrn     v5.4h, v8.4s, #idct_stg2_shift
    sqrshrn     v2.4h, v2.4s, #idct_stg2_shift
    sqrshrn     v9.4h, v6.4s, #idct_stg2_shift
    sqrshrn     v6.4h, v24.4s, #idct_stg2_shift

    sub         v22.4s, v20.4s, v18.4s      //// a2 = c1 - d1(part of x2,x5)
    add         v18.4s, v20.4s, v18.4s      //// a1 = c1 + d1(part of x1,x6)

    add         v30.4s, v22.4s, v28.4s
    sub         v24.4s, v22.4s, v28.4s
    add         v28.4s, v18.4s, v26.4s
    sub         v22.4s, v18.4s, v26.4s

    sqrshrn     v4.4h, v30.4s, #idct_stg2_shift
    sqrshrn     v7.4h, v24.4s, #idct_stg2_shift
    sqrshrn     v3.4h, v28.4s, #idct_stg2_shift
    sqrshrn     v8.4h, v22.4s, #idct_stg2_shift

    umov        x19, v25.d[0]
    umov        x20, v25.d[1]

    trn1        v27.4h, v2.4h, v3.4h
    trn2        v29.4h, v2.4h, v3.4h
    trn1        v25.4h, v4.4h, v5.4h
    trn2        v31.4h, v4.4h, v5.4h

    trn1        v2.2s, v27.2s, v25.2s
    trn2        v4.2s, v27.2s, v25.2s
    trn1        v3.2s, v29.2s, v31.2s
    trn2        v5.2s, v29.2s, v31.2s

    trn1        v27.4h, v6.4h, v7.4h
    trn2        v29.4h, v6.4h, v7.4h
    trn1        v25.4h, v8.4h, v9.4h
    trn2        v31.4h, v8.4h, v9.4h

    trn1        v6.2s, v27.2s, v25.2s
    trn2        v8.2s, v27.2s, v25.2s
    trn1        v7.2s, v29.2s, v31.2s
    trn2        v9.2s, v29.2s, v31.2s

    mov         v25.d[0], x19
    mov         v25.d[1], x20

    smull       v24.4s, v14.4h, v0.4h[1]    //// y1 * cos1(part of b0)
    smull       v26.4s, v14.4h, v0.4h[3]    //// y1 * cos3(part of b1)
    smull       v28.4s, v14.4h, v1.4h[1]    //// y1 * sin3(part of b2)
    smull       v30.4s, v14.4h, v1.4h[3]    //// y1 * sin1(part of b3)

    smlal       v24.4s, v15.4h, v0.4h[3]    //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v15.4h, v1.4h[3]    //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v15.4h, v0.4h[1]    //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v15.4h, v1.4h[1]    //// y1 * sin1 - y3 * sin3(part of b3)

    smull       v20.4s, v10.4h, v0.4h[0]    //// y0 * cos4(part of c0 and c1)
    smull       v18.4s, v11.4h, v1.4h[2]    //// y2 * sin2 (q7 is freed by this time)(part of d1)
    smull       v14.4s, v11.4h, v0.4h[2]    //// y2 * cos2(part of d0)

    add         x4, x2, x8, lsl #1          // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
    add         x5, x8, x8, lsl #1          //
    add         x0, x3, x7, lsl #1          // x0 points to 3rd row of dest data
    add         x10, x7, x7, lsl #1         //

    // swapping v3 and v6
    mov         v31.d[0], v3.d[0]
    mov         v3.d[0], v6.d[0]
    mov         v6.d[0], v31.d[0]

    // swapping v5 and v8
    mov         v31.d[0], v5.d[0]
    mov         v5.d[0], v8.d[0]
    mov         v8.d[0], v31.d[0]

    sub         v22.4s, v20.4s, v14.4s      //// a3 = c0 - d0(part of x3,x4)
    add         v12.4s, v20.4s, v14.4s      //// a0 = c0 + d0(part of x0,x7)

    add         v0.4s, v12.4s, v24.4s
    sub         v24.4s, v12.4s, v24.4s
    add         v12.4s, v22.4s, v30.4s
    sub         v14.4s, v22.4s, v30.4s

    sqrshrn     v10.4h, v0.4s, #idct_stg2_shift
    sqrshrn     v17.4h, v24.4s, #idct_stg2_shift
    sqrshrn     v13.4h, v12.4s, #idct_stg2_shift
    sqrshrn     v14.4h, v14.4s, #idct_stg2_shift
    sub         v22.4s, v20.4s, v18.4s      //// a2 = c1 - d1(part of x2,x5)
    add         v18.4s, v20.4s, v18.4s      //// a1 = c1 + d1(part of x1,x6)

    add         v0.4s, v22.4s, v28.4s
    sub         v24.4s, v22.4s, v28.4s
    add         v28.4s, v18.4s, v26.4s
    sub         v26.4s, v18.4s, v26.4s

    ld1         {v18.8b}, [x2], x8
    sqrshrn     v12.4h, v0.4s, #idct_stg2_shift
    ld1         {v20.8b}, [x2], x5
    sqrshrn     v15.4h, v24.4s, #idct_stg2_shift
    ld1         {v19.8b}, [x2], x8
    sqrshrn     v11.4h, v28.4s, #idct_stg2_shift
    ld1         {v22.8b}, [x4], x8
    sqrshrn     v16.4h, v26.4s, #idct_stg2_shift
    ld1         {v21.8b}, [x2], x5

    b           pred_buff_addition

end_skip_last4cols:
    adrp        x14, :got:gai2_impeg2_idct_first_col_q11
    ldr         x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11]
    ld1         {v0.4h, v1.4h}, [x14]

    umov        x19, v25.d[0]
    umov        x20, v25.d[1]

///* now the idct of columns is done, transpose so that row idct done efficiently(step5) */
    trn1        v27.4h, v2.4h, v6.4h
    trn2        v29.4h, v2.4h, v6.4h        ////[x3,x1],[x2,x0] first quadrant transposing
    trn1        v25.4h, v3.4h, v7.4h
    trn2        v31.4h, v3.4h, v7.4h        ////[x3,x1],[x2,x0] first quadrant transposing

    trn1        v2.2s, v27.2s, v25.2s
    trn2        v3.2s, v27.2s, v25.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....
    trn1        v6.2s, v29.2s, v31.2s
    trn2        v7.2s, v29.2s, v31.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....

    trn1        v27.4h, v4.4h, v8.4h
    trn2        v29.4h, v4.4h, v8.4h        ////[x3,x1],[x2,x0] second quadrant transposing
    trn1        v25.4h, v5.4h, v9.4h
    trn2        v31.4h, v5.4h, v9.4h        ////[x3,x1],[x2,x0] second quadrant transposing

    trn1        v4.2s, v27.2s, v25.2s
    trn2        v5.2s, v27.2s, v25.2s       ////x0,x1,x2,x3 second quadrant transposing continued.....
    trn1        v8.2s, v29.2s, v31.2s
    trn2        v9.2s, v29.2s, v31.2s       ////x0,x1,x2,x3 second quadrant transposing continued.....

    trn1        v27.4h, v10.4h, v14.4h
    trn2        v29.4h, v10.4h, v14.4h      ////[x7,x5],[x6,x4] third quadrant transposing
    trn1        v25.4h, v11.4h, v15.4h
    trn2        v31.4h, v11.4h, v15.4h      ////[x7,x5],[x6,x4] third quadrant transposing

    trn1        v10.2s, v27.2s, v25.2s
    trn2        v11.2s, v27.2s, v25.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....
    trn1        v14.2s, v29.2s, v31.2s
    trn2        v15.2s, v29.2s, v31.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....

    trn1        v27.4h, v12.4h, v16.4h
    trn2        v29.4h, v12.4h, v16.4h      ////[x7,x5],[x6,x4] fourth quadrant transposing
    trn1        v25.4h, v13.4h, v17.4h
    trn2        v31.4h, v13.4h, v17.4h      ////[x7,x5],[x6,x4] fourth quadrant transposing

    trn1        v12.2s, v27.2s, v25.2s
    trn2        v13.2s, v27.2s, v25.2s      ////x4,x5,x6,x7 fourth quadrant transposing continued.....
    trn1        v16.2s, v29.2s, v31.2s
    trn2        v17.2s, v29.2s, v31.2s      ////x4,x5,x6,x7 fourth quadrant transposing continued.....

    mov         v25.d[0], x19
    mov         v25.d[1], x20

////step6 operate on first four rows and find their idct
////register usage.extern        - storing and idct of rows
//// cosine constants    -   d0
//// sine constants      -   d1
//// element 0 first four    -   d2      -   y0
//// element 1 first four    -   d6      -   y1
//// element 2 first four    -   d3      -   y2
//// element 3 first four    -   d7      -   y3
//// element 4 first four    -   d4      -   y4
//// element 5 first four    -   d8      -   y5
//// element 6 first four    -   d5      -   y6
//// element 7 first four    -   d9      -   y7
//// element 0 second four   -   d10     -   y0
//// element 1 second four   -   d14     -   y1
//// element 2 second four   -   d11     -   y2
//// element 3 second four   -   d15     -   y3
//// element 4 second four   -   d12     -   y4
//// element 5 second four   -   d16     -   y5
//// element 6 second four   -   d13     -   y6
//// element 7 second four   -   d17     -   y7

//// map between first kernel code seq and current
////    d2  ->  d2
////    d6  ->  d6
////    d3  ->  d3
////    d7  ->  d7
////    d10 ->  d4
////    d14 ->  d8
////    d11 ->  d5
////    d15 ->  d9
////    q3  ->  q3
////    q5  ->  q2
////    q7  ->  q4
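////step6 applies the same even/odd butterfly as the column pass (see the C
////sketch near the start of impeg2_idct_recon_av8), but on the transposed
////first-stage output and with idct_stg2_shift (16), so the narrowed results
////are the final residuals; the prediction loads are interleaved so the
////reconstruction below can start as soon as each row is ready.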
    smull       v24.4s, v6.4h, v0.4h[1]     //// y1 * cos1(part of b0)
    smull       v26.4s, v6.4h, v0.4h[3]     //// y1 * cos3(part of b1)
    smull       v28.4s, v6.4h, v1.4h[1]     //// y1 * sin3(part of b2)
    smull       v30.4s, v6.4h, v1.4h[3]     //// y1 * sin1(part of b3)

    smlal       v24.4s, v7.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v7.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v7.4h, v0.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v7.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)

    smull       v20.4s, v2.4h, v0.4h[0]     //// y0 * cos4(part of c0 and c1)
    smull       v22.4s, v4.4h, v0.4h[0]     //// y4 * cos4(part of c0 and c1)
    smull       v18.4s, v3.4h, v1.4h[2]     //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull       v6.4s, v3.4h, v0.4h[2]      //// y2 * cos2(part of d0)

    smlal       v24.4s, v8.4h, v1.4h[1]     //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl       v26.4s, v8.4h, v0.4h[1]     //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal       v28.4s, v8.4h, v1.4h[3]     //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal       v30.4s, v8.4h, v0.4h[3]     //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl       v18.4s, v5.4h, v0.4h[2]     //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal       v6.4s, v5.4h, v1.4h[2]      //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add         v2.4s, v20.4s, v22.4s       //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub         v20.4s, v20.4s, v22.4s      //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal       v24.4s, v9.4h, v1.4h[3]     //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    smlsl       v26.4s, v9.4h, v1.4h[1]     //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    smlal       v28.4s, v9.4h, v0.4h[3]     //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl       v30.4s, v9.4h, v0.4h[1]     //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    sub         v22.4s, v2.4s, v6.4s        //// a3 = c0 - d0(part of x3,x4)
    add         v4.4s, v2.4s, v6.4s         //// a0 = c0 + d0(part of x0,x7)

    add         v2.4s, v4.4s, v24.4s
    sub         v6.4s, v4.4s, v24.4s
    add         v8.4s, v22.4s, v30.4s
    sub         v24.4s, v22.4s, v30.4s

    sqrshrn     v5.4h, v8.4s, #idct_stg2_shift
    sqrshrn     v2.4h, v2.4s, #idct_stg2_shift
    sqrshrn     v9.4h, v6.4s, #idct_stg2_shift
    sqrshrn     v6.4h, v24.4s, #idct_stg2_shift

    sub         v22.4s, v20.4s, v18.4s      //// a2 = c1 - d1(part of x2,x5)
    add         v18.4s, v20.4s, v18.4s      //// a1 = c1 + d1(part of x1,x6)

    add         v30.4s, v22.4s, v28.4s
    sub         v24.4s, v22.4s, v28.4s
    add         v28.4s, v18.4s, v26.4s
    sub         v22.4s, v18.4s, v26.4s

    sqrshrn     v4.4h, v30.4s, #idct_stg2_shift
    sqrshrn     v7.4h, v24.4s, #idct_stg2_shift
    sqrshrn     v3.4h, v28.4s, #idct_stg2_shift
    sqrshrn     v8.4h, v22.4s, #idct_stg2_shift

    umov        x19, v25.d[0]
    umov        x20, v25.d[1]

    trn1        v27.4h, v2.4h, v3.4h
    trn2        v29.4h, v2.4h, v3.4h
    trn1        v25.4h, v4.4h, v5.4h
    trn2        v31.4h, v4.4h, v5.4h

    trn1        v2.2s, v27.2s, v25.2s
    trn2        v4.2s, v27.2s, v25.2s
    trn1        v3.2s, v29.2s, v31.2s
    trn2        v5.2s, v29.2s, v31.2s

    trn1        v27.4h, v6.4h, v7.4h
    trn2        v29.4h, v6.4h, v7.4h
    trn1        v25.4h, v8.4h, v9.4h
    trn2        v31.4h, v8.4h, v9.4h

    trn1        v6.2s, v27.2s, v25.2s
    trn2        v8.2s, v27.2s, v25.2s
    trn1        v7.2s, v29.2s, v31.2s
    trn2        v9.2s, v29.2s, v31.2s

    mov         v25.d[0], x19
    mov         v25.d[1], x20

    smull       v24.4s, v14.4h, v0.4h[1]    //// y1 * cos1(part of b0)
    smull       v26.4s, v14.4h, v0.4h[3]    //// y1 * cos3(part of b1)
    smull       v28.4s, v14.4h, v1.4h[1]    //// y1 * sin3(part of b2)
    smull       v30.4s, v14.4h, v1.4h[3]    //// y1 * sin1(part of b3)

    smlal       v24.4s, v15.4h, v0.4h[3]    //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v15.4h, v1.4h[3]    //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v15.4h, v0.4h[1]    //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v15.4h, v1.4h[1]    //// y1 * sin1 - y3 * sin3(part of b3)

    smull       v20.4s, v10.4h, v0.4h[0]    //// y0 * cos4(part of c0 and c1)
    smull       v22.4s, v12.4h, v0.4h[0]    //// y4 * cos4(part of c0 and c1)
    smull       v18.4s, v11.4h, v1.4h[2]    //// y2 * sin2 (q7 is freed by this time)(part of d1)
    smull       v14.4s, v11.4h, v0.4h[2]    //// y2 * cos2(part of d0)

    smlal       v24.4s, v16.4h, v1.4h[1]    //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    add         x4, x2, x8, lsl #1          // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
    smlsl       v26.4s, v16.4h, v0.4h[1]    //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    add         x5, x8, x8, lsl #1          //
    smlal       v28.4s, v16.4h, v1.4h[3]    //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    add         x0, x3, x7, lsl #1          // x0 points to 3rd row of dest data
    smlal       v30.4s, v16.4h, v0.4h[3]    //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
    add         x10, x7, x7, lsl #1         //

    smlsl       v18.4s, v13.4h, v0.4h[2]    //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal       v14.4s, v13.4h, v1.4h[2]    //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add         v12.4s, v20.4s, v22.4s      //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub         v20.4s, v20.4s, v22.4s      //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal       v24.4s, v17.4h, v1.4h[3]    //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)

    // swapping v3 and v6
    mov         v31.d[0], v3.d[0]
    mov         v3.d[0], v6.d[0]
    mov         v6.d[0], v31.d[0]

    smlsl       v26.4s, v17.4h, v1.4h[1]    //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)

    // swapping v5 and v8
    mov         v31.d[0], v5.d[0]
    mov         v5.d[0], v8.d[0]
    mov         v8.d[0], v31.d[0]

    smlal       v28.4s, v17.4h, v0.4h[3]    //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl       v30.4s, v17.4h, v0.4h[1]    //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    sub         v22.4s, v12.4s, v14.4s      //// a3 = c0 - d0(part of x3,x4)
    add         v12.4s, v12.4s, v14.4s      //// a0 = c0 + d0(part of x0,x7)

    add         v0.4s, v12.4s, v24.4s
    sub         v24.4s, v12.4s, v24.4s
    add         v12.4s, v22.4s, v30.4s
    sub         v14.4s, v22.4s, v30.4s

    sqrshrn     v10.4h, v0.4s, #idct_stg2_shift
    sqrshrn     v17.4h, v24.4s, #idct_stg2_shift
    sqrshrn     v13.4h, v12.4s, #idct_stg2_shift
    sqrshrn     v14.4h, v14.4s, #idct_stg2_shift

    sub         v22.4s, v20.4s, v18.4s      //// a2 = c1 - d1(part of x2,x5)
    add         v18.4s, v20.4s, v18.4s      //// a1 = c1 + d1(part of x1,x6)

    add         v0.4s, v22.4s, v28.4s
    sub         v24.4s, v22.4s, v28.4s
    add         v28.4s, v18.4s, v26.4s
    sub         v26.4s, v18.4s, v26.4s

    ld1         {v18.8b}, [x2], x8
    sqrshrn     v12.4h, v0.4s, #idct_stg2_shift
    ld1         {v20.8b}, [x2], x5
    sqrshrn     v15.4h, v24.4s, #idct_stg2_shift
    ld1         {v19.8b}, [x2], x8
    sqrshrn     v11.4h, v28.4s, #idct_stg2_shift
    ld1         {v22.8b}, [x4], x8
    sqrshrn     v16.4h, v26.4s, #idct_stg2_shift
    ld1         {v21.8b}, [x2], x5
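///**
// * pred_buff_addition transposes the remaining second-stage outputs back to
// * raster order, widens the prediction rows, adds the residuals and saturates
// * to 8 bit, i.e. roughly (illustration only):
// *
// *   pu1_dst[i * dst_strd + j] = CLIP_U8(pu1_pred[i * pred_strd + j] + residual[i][j]);
// *
// * where CLIP_U8 is the same hypothetical clamp-to-[0,255] helper used in the
// * sketches above; here the uaddw/sqxtun pairs implement it.
// */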
pred_buff_addition:
    umov        x19, v25.d[0]
    umov        x20, v25.d[1]

    trn1        v27.4h, v10.4h, v11.4h
    trn2        v29.4h, v10.4h, v11.4h
    trn1        v25.4h, v12.4h, v13.4h
    trn2        v31.4h, v12.4h, v13.4h

    trn1        v10.2s, v27.2s, v25.2s
    trn2        v12.2s, v27.2s, v25.2s
    trn1        v11.2s, v29.2s, v31.2s
    trn2        v13.2s, v29.2s, v31.2s

    trn1        v27.4h, v14.4h, v15.4h
    trn2        v29.4h, v14.4h, v15.4h
    trn1        v25.4h, v16.4h, v17.4h
    trn2        v31.4h, v16.4h, v17.4h

    trn1        v14.2s, v27.2s, v25.2s
    trn2        v16.2s, v27.2s, v25.2s
    trn1        v15.2s, v29.2s, v31.2s
    trn2        v17.2s, v29.2s, v31.2s

    mov         v25.d[0], x19
    mov         v25.d[1], x20

    ld1         {v24.8b}, [x4], x5
    ld1         {v23.8b}, [x4], x8
    ld1         {v25.8b}, [x4], x5

    mov         v2.d[1], v3.d[0]
    mov         v4.d[1], v5.d[0]
    mov         v6.d[1], v7.d[0]
    mov         v8.d[1], v9.d[0]

    uaddw       v2.8h, v2.8h, v18.8b
    uaddw       v4.8h, v4.8h, v22.8b
    uaddw       v6.8h, v6.8h, v20.8b
    uaddw       v8.8h, v8.8h, v24.8b

    // swapping v11 and v14
    mov         v31.d[0], v11.d[0]
    mov         v11.d[0], v14.d[0]
    mov         v14.d[0], v31.d[0]

    // swapping v13 and v16
    mov         v31.d[0], v13.d[0]
    mov         v13.d[0], v16.d[0]
    mov         v16.d[0], v31.d[0]

// row values stored in the q register.
//q1 : x0
//q3 : x1
//q2 : x2
//q4 : x3
//q5 : x4
//q7 : x5
//q6 : x6
//q8 : x7

///// adding the prediction buffer
// load prediction data
// adding recon with prediction

    mov         v10.d[1], v11.d[0]
    mov         v12.d[1], v13.d[0]
    mov         v14.d[1], v15.d[0]
    mov         v16.d[1], v17.d[0]

    uaddw       v10.8h, v10.8h, v19.8b
    sqxtun      v2.8b, v2.8h
    uaddw       v14.8h, v14.8h, v21.8b
    sqxtun      v4.8b, v4.8h
    uaddw       v12.8h, v12.8h, v23.8b
    sqxtun      v6.8b, v6.8h
    uaddw       v16.8h, v16.8h, v25.8b
    sqxtun      v8.8b, v8.8h

    st1         {v2.8b}, [x3], x7
    sqxtun      v10.8b, v10.8h
    st1         {v6.8b}, [x3], x10
    sqxtun      v14.8b, v14.8h
    st1         {v4.8b}, [x0], x7
    sqxtun      v12.8b, v12.8h
    st1         {v8.8b}, [x0], x10
    sqxtun      v16.8b, v16.8h

    st1         {v10.8b}, [x3], x7
    st1         {v14.8b}, [x3], x10
    st1         {v12.8b}, [x0], x7
    st1         {v16.8b}, [x0], x10

//  ldmfd sp!,{x4-x12,pc}
    ldp         x19, x20, [sp], #16
    pop_v_regs
    ret