/* * MIPS DSPr2 optimizations for libjpeg-turbo * * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. * All Rights Reserved. * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com> * Darko Laus <darko.laus@imgtec.com> * Copyright (C) 2015, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages * arising from the use of this software. * * Permission is granted to anyone to use this software for any purpose, * including commercial applications, and to alter it and redistribute it * freely, subject to the following restrictions: * * 1. The origin of this software must not be misrepresented; you must not * claim that you wrote the original software. If you use this software * in a product, an acknowledgment in the product documentation would be * appreciated but is not required. * 2. Altered source versions must be plainly marked as such, and must not be * misrepresented as being the original software. * 3. This notice may not be removed or altered from any source distribution. 
*/

#include "jsimd_dspr2_asm.h"


/*****************************************************************************/
LEAF_DSPR2(jsimd_c_null_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 * 20(sp) = cinfo->num_components
 *
 * Null conversion for compression
 *
 * For each row and each component index ci, copies the bytes
 * inptr[ci], inptr[ci + nc], inptr[ci + 2 * nc], ... (nc = num_components)
 * into output_buf[ci][output_row], image_width bytes per component.
 *
 * The stack arguments are read at 24(sp)/28(sp), i.e. 8 bytes above the
 * offsets listed here; presumably SAVE_REGS_ON_STACK (jsimd_dspr2_asm.h)
 * lowers sp by its first argument.
 *
 * Register roles:
 *   t9 = rows remaining, s0 = num_components, t0 = image_width & 3,
 *   t1 = ci, t2 = inptr, t5 = outptr, s1 = outptr row end,
 *   t6 = end of the residual (byte-at-a-time) part of the row.
 *
 * Labels 0-3 handle widths that are not a multiple of 4 (residual bytes
 * first, then groups of 4); labels 4-6 handle widths that are a multiple
 * of 4.  The instruction after every branch is its delay slot.
 */
    SAVE_REGS_ON_STACK 8, s0, s1

    lw          t9, 24(sp)      // t9 = num_rows
    lw          s0, 28(sp)      // s0 = cinfo->num_components
    andi        t0, a0, 3       // t0 = cinfo->image_width & 3
    beqz        t0, 4f          // no residual
     nop
0:
    addiu       t9, t9, -1
    bltz        t9, 7f
     li         t1, 0
1:
    sll         t3, t1, 2
    lwx         t5, t3(a2)      // t5 = outptr = output_buf[ci]
    lw          t2, 0(a1)       // t2 = inptr = *input_buf
    sll         t4, a3, 2
    lwx         t5, t4(t5)      // t5 = outptr = output_buf[ci][output_row]
    addu        t2, t2, t1
    addu        s1, t5, a0
    addu        t6, t5, t0
2:
    lbu         t3, 0(t2)
    addiu       t5, t5, 1
    sb          t3, -1(t5)
    bne         t6, t5, 2b
     addu       t2, t2, s0
    /*
     * image_width < 4: the residual loop above has already written the
     * whole row (t5 == s1).  Loop 3 is bottom-tested and advances t5 by 4,
     * so entering it here would step past s1 and never terminate.
     */
    beq         s1, t5, 8f
     nop
3:
    lbu         t3, 0(t2)
    addu        t4, t2, s0
    addu        t7, t4, s0
    addu        t8, t7, s0
    addu        t2, t8, s0
    lbu         t4, 0(t4)
    lbu         t7, 0(t7)
    lbu         t8, 0(t8)
    addiu       t5, t5, 4
    sb          t3, -4(t5)
    sb          t4, -3(t5)
    sb          t7, -2(t5)
    bne         s1, t5, 3b
     sb         t8, -1(t5)
8:
    addiu       t1, t1, 1
    bne         t1, s0, 1b
     nop
    addiu       a1, a1, 4
    bgez        t9, 0b
     addiu      a3, a3, 1
    b           7f
     nop
4:
    addiu       t9, t9, -1
    bltz        t9, 7f
     li         t1, 0
5:
    sll         t3, t1, 2
    lwx         t5, t3(a2)      // t5 = outptr = output_buf[ci]
    lw          t2, 0(a1)       // t2 = inptr = *input_buf
    sll         t4, a3, 2
    lwx         t5, t4(t5)      // t5 = outptr = output_buf[ci][output_row]
    addu        t2, t2, t1
    addu        s1, t5, a0
    addu        t6, t5, t0
6:
    lbu         t3, 0(t2)
    addu        t4, t2, s0
    addu        t7, t4, s0
    addu        t8, t7, s0
    addu        t2, t8, s0
    lbu         t4, 0(t4)
    lbu         t7, 0(t7)
    lbu         t8, 0(t8)
    addiu       t5, t5, 4
    sb          t3, -4(t5)
    sb          t4, -3(t5)
    sb          t7, -2(t5)
    bne         s1, t5, 6b
     sb         t8, -1(t5)
    addiu       t1, t1, 1
    bne         t1, s0, 5b
     nop
    addiu       a1, a1, 4
    bgez        t9, 4b
     addiu      a3, a3, 1
7:
    RESTORE_REGS_FROM_STACK 8, s0, s1

    j           ra
     nop

END(jsimd_c_null_convert_dspr2)


/*****************************************************************************/
/*
 * jsimd_extrgb_ycc_convert_dspr2
 * jsimd_extbgr_ycc_convert_dspr2
 * jsimd_extrgbx_ycc_convert_dspr2
 * jsimd_extbgrx_ycc_convert_dspr2
 * jsimd_extxbgr_ycc_convert_dspr2
 * jsimd_extxrgb_ycc_convert_dspr2
 *
 * Colorspace conversion RGB -> YCbCr
 */

.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2  colorid, pixel_size, \
                                             r_offs, g_offs, b_offs

/* Load one pixel's R, G, B bytes from \inptr and advance it by one pixel. */
.macro DO_RGB_TO_YCC  r, g, b, inptr
    lbu         \r, \r_offs(\inptr)
    lbu         \g, \g_offs(\inptr)
    lbu         \b, \b_offs(\inptr)
    addiu       \inptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 *
 * Register roles:
 *   s0-s7 = fixed-point coefficients, t8 = offset preloaded into ac1/ac2,
 *   t7 = rows remaining, t6 = inptr,
 *   t0/t1/t2 = output_buf[0..2][output_row], t9 = end of the t2 row.
 * Accumulators: $ac0 is stored through t0, $ac1 through t1, $ac2 through t2.
 *
 * Both loops are bottom-tested, so at least one row and one pixel are
 * processed; presumably callers guarantee num_rows >= 1 and
 * image_width >= 1.
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          t7, 48(sp)      // t7 = num_rows
    li          s0, 0x4c8b      // FIX(0.29900)
    li          s1, 0x9646      // FIX(0.58700)
    li          s2, 0x1d2f      // FIX(0.11400)
    li          s3, 0xffffd4cd  // -FIX(0.16874)
    li          s4, 0xffffab33  // -FIX(0.33126)
    li          s5, 0x8000      // FIX(0.50000)
    li          s6, 0xffff94d1  // -FIX(0.41869)
    li          s7, 0xffffeb2f  // -FIX(0.08131)
    li          t8, 0x807fff    // CBCR_OFFSET + ONE_HALF-1

0:
    addiu       t7, -1          // --num_rows
    lw          t6, 0(a1)       // t6 = input_buf[0]
    lw          t0, 0(a2)
    lw          t1, 4(a2)
    lw          t2, 8(a2)
    sll         t3, a3, 2
    lwx         t0, t3(t0)      // t0 = output_buf[0][output_row]
    lwx         t1, t3(t1)      // t1 = output_buf[1][output_row]
    lwx         t2, t3(t2)      // t2 = output_buf[2][output_row]

    addu        t9, t2, a0      // t9 = end address
    addiu       a3, 1

1:
    DO_RGB_TO_YCC t3, t4, t5, t6

    mtlo        s5, $ac0
    mtlo        t8, $ac1
    mtlo        t8, $ac2
    maddu       $ac0, s2, t5
    maddu       $ac1, s5, t5
    maddu       $ac2, s5, t3
    maddu       $ac0, s0, t3
    maddu       $ac1, s3, t3
    maddu       $ac2, s6, t4
    maddu       $ac0, s1, t4
    maddu       $ac1, s4, t4
    maddu       $ac2, s7, t5
    extr.w      t3, $ac0, 16
    extr.w      t4, $ac1, 16
    extr.w      t5, $ac2, 16
    sb          t3, 0(t0)
    sb          t4, 0(t1)
    sb          t5, 0(t2)
    addiu       t0, 1
    addiu       t2, 1
    bne         t2, t9, 1b
     addiu      t1, 1
    bgtz        t7, 0b
     addiu      a1, 4

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_\colorid\()_ycc_convert_dspr2)

.purgem DO_RGB_TO_YCC

.endm

/*-------------------------------------id -- pix R  G  B */
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3


/*****************************************************************************/
/*
 * jsimd_ycc_extrgb_convert_dspr2
 * jsimd_ycc_extbgr_convert_dspr2
 * jsimd_ycc_extrgbx_convert_dspr2
 * jsimd_ycc_extbgrx_convert_dspr2
 * jsimd_ycc_extxbgr_convert_dspr2
 * jsimd_ycc_extxrgb_convert_dspr2
 *
 * Colorspace conversion YCbCr -> RGB
 */

.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2  colorid, pixel_size, \
                                             r_offs, g_offs, b_offs, a_offs

/*
 * Store one pixel at \outptr and advance it by one pixel.  For 4-byte
 * pixels the fourth byte is set to 0xFF; this clobbers t0, which is only
 * written after \scratch2 has been stored.
 */
.macro STORE_YCC_TO_RGB  scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r_offs(\outptr)
    sb          \scratch1, \g_offs(\outptr)
    sb          \scratch2, \b_offs(\outptr)
.if (\pixel_size == 4)
    li          t0, 0xFF
    sb          t0, \a_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = input_row
 * a3     = output_buf
 * 16(sp) = num_rows
 *
 * Register roles:
 *   s1 = rows remaining, s0 = outptr, s2/s3/s4 = y/cb/cr input pointers,
 *   t9 = end of the y row, t3-t7 = fixed-point constants,
 *   t8 = 128 replicated into both halfwords.
 *
 * Both loops are bottom-tested, so at least one row and one pixel are
 * processed; presumably callers guarantee num_rows >= 1 and
 * image_width >= 1.
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          s1, 48(sp)      // s1 = num_rows
    li          t3, 0x8000
    li          t4, 0x166e9     // FIX(1.40200)
    li          t5, 0x1c5a2     // FIX(1.77200)
    li          t6, 0xffff492e  // -FIX(0.71414)
    li          t7, 0xffffa7e6  // -FIX(0.34414)
    repl.ph     t8, 128

0:
    lw          s0, 0(a3)
    lw          t0, 0(a1)
    lw          t1, 4(a1)
    lw          t2, 8(a1)
    sll         s5, a2, 2
    addiu       s1, -1
    lwx         s2, s5(t0)
    lwx         s3, s5(t1)
    lwx         s4, s5(t2)
    addu        t9, s2, a0
    addiu       a2, 1

1:
    lbu         s7, 0(s4)       // cr
    lbu         s6, 0(s3)       // cb
    lbu         s5, 0(s2)       // y
    addiu       s2, 1
    addiu       s4, 1
    addiu       s7, -128
    addiu       s6, -128
    mul         t2, t7, s6
    mul         t0, t6, s7      // Crgtab[cr]
    sll         s7, 15
    mulq_rs.w   t1, t4, s7      // Crrtab[cr]
    sll         s6, 15
    addu        t2, t3          // Cbgtab[cb]
    addu        t2, t0

    mulq_rs.w   t0, t5, s6      // Cbbtab[cb]
    sra         t2, 16
    addu        t1, s5
    addu        t2, s5          // add y
    ins         t2, t1, 16, 16
    subu.ph     t2, t2, t8
    addu        t0, s5
    shll_s.ph   t2, t2, 8
    subu        t0, 128
    shra.ph     t2, t2, 8
    shll_s.w    t0, t0, 24
    addu.ph     t2, t2, t8      // clip & store
    sra         t0, t0, 24
    sra         t1, t2, 16
    addiu       t0, 128

    STORE_YCC_TO_RGB t1, t2, t0, s0

    bne         s2, t9, 1b
     addiu      s3, 1
    bgtz        s1, 0b
     addiu      a3, 4

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_ycc_\colorid\()_convert_dspr2)

.purgem STORE_YCC_TO_RGB

.endm

/*-------------------------------------id -- pix R  G  B  A */
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb,  3, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr,  3, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0


/*****************************************************************************/
/*
 * jsimd_extrgb_gray_convert_dspr2
 * jsimd_extbgr_gray_convert_dspr2
 * jsimd_extrgbx_gray_convert_dspr2
 * jsimd_extbgrx_gray_convert_dspr2
 * jsimd_extxbgr_gray_convert_dspr2
 * jsimd_extxrgb_gray_convert_dspr2
 *
 * Colorspace conversion RGB -> GRAY
 */

.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2  colorid, pixel_size, \
                                              r_offs, g_offs, b_offs

/* Load one pixel's R, G, B bytes from \inptr and advance it by one pixel. */
.macro DO_RGB_TO_GRAY  r, g, b, inptr
    lbu         \r, \r_offs(\inptr)
    lbu         \g, \g_offs(\inptr)
    lbu         \b, \b_offs(\inptr)
    addiu       \inptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 *
 * Register roles:
 *   s0/s1/s2 = R/G/B weights, s7 = rounding constant, s6 = rows remaining,
 *   t7 = image_width & 3, t0 = inptr, t1 = outptr, t9 = end of output row,
 *   t8 = end of the part handled four pixels at a time.
 *
 * Loop 1 converts four pixels per iteration using $ac0 and $ac1
 * alternately; loop 3 converts the remaining t7 pixels one at a time.
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    li          s0, 0x4c8b      // s0 = FIX(0.29900)
    li          s1, 0x9646      // s1 = FIX(0.58700)
    li          s2, 0x1d2f      // s2 = FIX(0.11400)
    li          s7, 0x8000      // s7 = FIX(0.50000)
    lw          s6, 48(sp)
    andi        t7, a0, 3

0:
    addiu       s6, -1          // s6 = num_rows
    lw          t0, 0(a1)
    lw          t1, 0(a2)
    sll         t3, a3, 2
    lwx         t1, t3(t1)
    addiu       a3, 1
    addu        t9, t1, a0
    subu        t8, t9, t7
    beq         t1, t8, 2f      // fewer than four pixels: residual only
     nop

1:
    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    maddu       $ac0, s0, t3
    mtlo        s7, $ac1
    maddu       $ac1, s2, s5
    maddu       $ac1, s1, s4
    maddu       $ac1, s0, s3
    extr.w      t6, $ac0, 16

    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    extr.w      t2, $ac1, 16
    maddu       $ac0, s0, t3
    mtlo        s7, $ac1
    maddu       $ac1, s2, s5
    maddu       $ac1, s1, s4
    maddu       $ac1, s0, s3
    extr.w      t5, $ac0, 16
    sb          t6, 0(t1)
    sb          t2, 1(t1)
    extr.w      t3, $ac1, 16
    addiu       t1, 4
    sb          t5, -2(t1)
    sb          t3, -1(t1)
    bne         t1, t8, 1b
     nop

2:
    beqz        t7, 4f
     nop

3:
    DO_RGB_TO_GRAY t3, t4, t5, t0

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    maddu       $ac0, s0, t3
    extr.w      t6, $ac0, 16
    sb          t6, 0(t1)
    addiu       t1, 1
    bne         t1, t9, 3b
     nop

4:
    bgtz        s6, 0b
     addiu      a1, 4

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_\colorid\()_gray_convert_dspr2)

.purgem DO_RGB_TO_GRAY

.endm

/*-------------------------------------id --  pix R  G  B */
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3


/*****************************************************************************/
/*
 * jsimd_h2v2_merged_upsample_dspr2
 * jsimd_h2v2_extrgb_merged_upsample_dspr2
 * jsimd_h2v2_extrgbx_merged_upsample_dspr2
 * jsimd_h2v2_extbgr_merged_upsample_dspr2
 * jsimd_h2v2_extbgrx_merged_upsample_dspr2
 * jsimd_h2v2_extxbgr_merged_upsample_dspr2
 * jsimd_h2v2_extxrgb_merged_upsample_dspr2
 *
 * Merged h2v2 upsample routines
 */

.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
                                            r1_offs, g1_offs, \
                                            b1_offs, a1_offs, \
                                            r2_offs, g2_offs, \
                                            b2_offs, a2_offs

/*
 * Store two adjacent pixels at \outptr and advance it by \pixel_size
 * (which covers both pixels).  When \pixel_size is 8, \scratch0 is
 * overwritten with 0xFF to fill both fourth bytes.
 */
.macro STORE_H2V2_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
                            scratch5 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
    sb          \scratch3, \r2_offs(\outptr)
    sb          \scratch4, \g2_offs(\outptr)
    sb          \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li          \scratch0, 0xFF
    sb          \scratch0, \a1_offs(\outptr)
    sb          \scratch0, \a2_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

/*
 * Store a single pixel at \outptr without advancing it.  When \pixel_size
 * is 8, t0 is clobbered to write 0xFF into the fourth byte.
 */
.macro STORE_H2V2_1_PIXEL  scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)

.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
/*
 * a0     = cinfo->output_width
 * a1     = input_buf
 * a2     = in_row_group_ctr
 * a3     = output_buf
 * 16(sp) = cinfo->sample_range_limit
 *
 * Register roles:
 *   t1/t2 = the two luma input rows, t5 = cb row, t6 = cr row,
 *   t4/t7 = the two output rows, t9 = range-limit table base,
 *   t8/s0/s1/s2 = fixed-point constants, t0 = end of the cb row (loop 1),
 *   s4/s5/s6 = red/green/blue chroma terms shared by four output pixels.
 * AT and ra are used as scratch registers; ra is in the save list.
 *
 * Loop 1 emits two pixels on each output row per chroma sample; the code
 * after label 2 emits the last column when output_width is odd.
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    lw          t9, 56(sp)      // cinfo->sample_range_limit
    lw          v0, 0(a1)
    lw          v1, 4(a1)
    lw          t0, 8(a1)
    sll         t1, a2, 3
    addiu       t2, t1, 4
    sll         t3, a2, 2
    lw          t4, 0(a3)       // t4 = output_buf[0]
    lwx         t1, t1(v0)      // t1 = input_buf[0][in_row_group_ctr*2]
    lwx         t2, t2(v0)      // t2 = input_buf[0][in_row_group_ctr*2 + 1]
    lwx         t5, t3(v1)      // t5 = input_buf[1][in_row_group_ctr]
    lwx         t6, t3(t0)      // t6 = input_buf[2][in_row_group_ctr]
    lw          t7, 4(a3)       // t7 = output_buf[1]
    li          s1, 0xe6ea
    addiu       t8, s1, 0x7fff    // t8 = 0x166e9 [FIX(1.40200)]
    addiu       s0, t8, 0x5eb9    // s0 = 0x1c5a2 [FIX(1.77200)]
    addiu       s1, zero, 0xa7e6  // s1 = 0xffffa7e6 [-FIX(0.34414)]
    xori        s2, s1, 0xeec8    // s2 = 0xffff492e [-FIX(0.71414)]
    srl         t3, a0, 1
    blez        t3, 2f
     addu       t0, t5, t3      // t0 = end address
 1:
    lbu         t3, 0(t5)
    lbu         s3, 0(t6)
    addiu       t5, t5, 1
    addiu       t3, t3, -128    // (cb - 128)
    addiu       s3, s3, -128    // (cr - 128)
    mult        $ac1, s1, t3
    madd        $ac1, s2, s3
    sll         s3, s3, 15
    sll         t3, t3, 15
    mulq_rs.w   s4, t8, s3      // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    extr_r.w    s5, $ac1, 16
    mulq_rs.w   s6, s0, t3      // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    lbu         v0, 0(t1)
    addiu       t6, t6, 1
    addiu       t1, t1, 2
    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // &range_limit[y+cred]
    addu        s3, t9, s3      // &range_limit[y+cgreen]
    addu        v1, t9, v1      // &range_limit[y+cblue]
    lbu         AT, 0(t3)
    lbu         s7, 0(s3)
    lbu         ra, 0(v1)
    lbu         v0, -1(t1)
    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // &range_limit[y+cred]
    addu        s3, t9, s3      // &range_limit[y+cgreen]
    addu        v1, t9, v1      // &range_limit[y+cblue]
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)
    lbu         v0, 0(t2)

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4

    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // &range_limit[y+cred]
    addu        s3, t9, s3      // &range_limit[y+cgreen]
    addu        v1, t9, v1      // &range_limit[y+cblue]
    lbu         AT, 0(t3)
    lbu         s7, 0(s3)
    lbu         ra, 0(v1)
    lbu         v0, 1(t2)
    addiu       t2, t2, 2
    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // &range_limit[y+cred]
    addu        s3, t9, s3      // &range_limit[y+cgreen]
    addu        v1, t9, v1      // &range_limit[y+cblue]
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7

    bne         t0, t5, 1b
     nop
2:
    andi        t0, a0, 1
    beqz        t0, 4f
     lbu        t3, 0(t5)       // delay slot: executed for even widths too
    lbu         s3, 0(t6)
    addiu       t3, t3, -128    // (cb - 128)
    addiu       s3, s3, -128    // (cr - 128)
    mult        $ac1, s1, t3
    madd        $ac1, s2, s3
    sll         s3, s3, 15
    sll         t3, t3, 15
    lbu         v0, 0(t1)
    extr_r.w    s5, $ac1, 16
    mulq_rs.w   s4, t8, s3      // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w   s6, s0, t3      // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // &range_limit[y+cred]
    addu        s3, t9, s3      // &range_limit[y+cgreen]
    addu        v1, t9, v1      // &range_limit[y+cblue]
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)
    lbu         v0, 0(t2)

    STORE_H2V2_1_PIXEL t3, s3, v1, t4

    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // &range_limit[y+cred]
    addu        s3, t9, s3      // &range_limit[y+cgreen]
    addu        v1, t9, v1      // &range_limit[y+cblue]
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)

    STORE_H2V2_1_PIXEL t3, s3, v1, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j           ra
     nop

END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V2_1_PIXEL
.purgem STORE_H2V2_2_PIXELS
.endm

/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4


/*****************************************************************************/
/*
 * jsimd_h2v1_merged_upsample_dspr2
 * jsimd_h2v1_extrgb_merged_upsample_dspr2
 * jsimd_h2v1_extrgbx_merged_upsample_dspr2
 * jsimd_h2v1_extbgr_merged_upsample_dspr2
 * jsimd_h2v1_extbgrx_merged_upsample_dspr2
 * jsimd_h2v1_extxbgr_merged_upsample_dspr2
 * jsimd_h2v1_extxrgb_merged_upsample_dspr2
 *
 * Merged h2v1 upsample routines
 */

.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
                                            r1_offs, g1_offs, \
                                            b1_offs, a1_offs, \
                                            r2_offs, g2_offs, \
                                            b2_offs, a2_offs

/*
 * Store two adjacent pixels at \outptr and advance it by \pixel_size
 * (which covers both pixels).  When \pixel_size is 8, t0 is clobbered to
 * write 0xFF into both fourth bytes.
 */
.macro STORE_H2V1_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
                            scratch5 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
    sb          \scratch3, \r2_offs(\outptr)
    sb          \scratch4, \g2_offs(\outptr)
    sb          \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
    sb          t0, \a2_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

/*
 * Store a single pixel at \outptr without advancing it.  When \pixel_size
 * is 8, t0 is clobbered to write 0xFF into the fourth byte.
 */
.macro STORE_H2V1_1_PIXEL  scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
/*
 * a0     = cinfo->output_width
 * a1     = input_buf
 * a2     = in_row_group_ctr
 * a3     = output_buf
 * 16(sp) = range_limit
 *
 * Register roles:
 *   s5/s6/s7 = y/cb/cr input pointers, t7 = outptr, t8 = range_limit,
 *   s0-s4 = fixed-point constants, t9 = end of the cb row (loop 1),
 *   t0/t5/t6 = red/green/blue chroma terms shared by two output pixels.
 * ra is used as a scratch register; it is in the save list.
 *
 * Loop 1 emits two pixels per chroma sample; the code at label 3 emits
 * the last pixel when output_width is odd.
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    li          t0, 0xe6ea
    lw          t1, 0(a1)       // t1 = input_buf[0]
    lw          t2, 4(a1)       // t2 = input_buf[1]
    lw          t3, 8(a1)       // t3 = input_buf[2]
    lw          t8, 56(sp)      // t8 = range_limit
    addiu       s1, t0, 0x7fff    // s1 = 0x166e9 [FIX(1.40200)]
    addiu       s2, s1, 0x5eb9    // s2 = 0x1c5a2 [FIX(1.77200)]
    addiu       s0, t0, 0x9916    // s0 = 0x8000
    addiu       s4, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
    xori        s3, s4, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
    srl         t0, a0, 1
    sll         t4, a2, 2
    lwx         s5, t4(t1)      // s5 = inptr0
    lwx         s6, t4(t2)      // s6 = inptr1
    lwx         s7, t4(t3)      // s7 = inptr2
    lw          t7, 0(a3)       // t7 = outptr
    blez        t0, 2f
     addu       t9, s6, t0      // t9 = end address
1:
    lbu         t2, 0(s6)       // t2 = cb
    lbu         t0, 0(s7)       // t0 = cr
    lbu         t1, 0(s5)       // t1 = y
    addiu       t2, t2, -128    // t2 = cb - 128
    addiu       t0, t0, -128    // t0 = cr - 128
    mult        $ac1, s4, t2
    madd        $ac1, s3, t0
    sll         t0, t0, 15
    sll         t2, t2, 15
    mulq_rs.w   t0, s1, t0      // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
    extr_r.w    t5, $ac1, 16
    mulq_rs.w   t6, s2, t2      // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
    addiu       s7, s7, 1
    addiu       s6, s6, 1
    addu        t2, t1, t0      // t2 = y + cred
    addu        t3, t1, t5      // t3 = y + cgreen
    addu        t4, t1, t6      // t4 = y + cblue
    addu        t2, t8, t2
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t1, 1(s5)
    lbu         v0, 0(t2)
    lbu         v1, 0(t3)
    lbu         ra, 0(t4)
    addu        t2, t1, t0
    addu        t3, t1, t5
    addu        t4, t1, t6
    addu        t2, t8, t2
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t2, 0(t2)
    lbu         t3, 0(t3)
    lbu         t4, 0(t4)

    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7

    bne         t9, s6, 1b
     addiu      s5, s5, 2
2:
    andi        t0, a0, 1
    beqz        t0, 4f
     nop
3:
    lbu         t2, 0(s6)
    lbu         t0, 0(s7)
    lbu         t1, 0(s5)
    addiu       t2, t2, -128    // (cb - 128)
    addiu       t0, t0, -128    // (cr - 128)
    mul         t3, s4, t2
    mul         t4, s3, t0
    sll         t0, t0, 15
    sll         t2, t2, 15
    mulq_rs.w   t0, s1, t0      // (C1*cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w   t6, s2, t2      // (C2*cb + ONE_HALF)>> SCALEBITS
    addu        t3, t3, s0
    addu        t3, t4, t3
    sra         t5, t3, 16      // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
    addu        t2, t1, t0      // y + cred
    addu        t3, t1, t5      // y + cgreen
    addu        t4, t1, t6      // y + cblue
    addu        t2, t8, t2
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t2, 0(t2)
    lbu         t3, 0(t3)
    lbu         t4, 0(t4)

    STORE_H2V1_1_PIXEL t2, t3, t4, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j           ra
     nop

END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V1_1_PIXEL
.purgem STORE_H2V1_2_PIXELS
.endm

/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 /*****************************************************************************/ /* * jsimd_h2v2_fancy_upsample_dspr2 * * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. */ LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2) /* * a0 = cinfo->max_v_samp_factor * a1 = downsampled_width * a2 = input_data * a3 = output_data_ptr */ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 li s4, 0 lw s2, 0(a3) // s2 = *output_data_ptr 0: li t9, 2 lw s1, -4(a2) // s1 = inptr1 1: lw s0, 0(a2) // s0 = inptr0 lwx s3, s4(s2) addiu s5, a1, -2 // s5 = downsampled_width - 2 srl t4, s5, 1 sll t4, t4, 1 lbu t0, 0(s0) lbu t1, 1(s0) lbu t2, 0(s1) lbu t3, 1(s1) addiu s0, 2 addiu s1, 2 addu t8, s0, t4 // t8 = end address andi s5, s5, 1 // s5 = residual sll t4, t0, 1 sll t6, t1, 1 addu t0, t0, t4 // t0 = (*inptr0++) * 3 addu t1, t1, t6 // t1 = (*inptr0++) * 3 addu t7, t0, t2 // t7 = thiscolsum addu t6, t1, t3 // t5 = nextcolsum sll t0, t7, 2 // t0 = thiscolsum * 4 subu t1, t0, t7 // t1 = thiscolsum * 3 shra_r.w t0, t0, 4 addiu t1, 7 addu t1, t1, t6 srl t1, t1, 4 sb t0, 0(s3) sb t1, 1(s3) beq t8, s0, 22f // skip to final iteration if width == 3 addiu s3, 2 2: lh t0, 0(s0) // t0 = A3|A2 lh t2, 0(s1) // t2 = B3|B2 addiu s0, 2 addiu s1, 2 preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2 preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2 shll.ph t1, t0, 1 sll t3, t6, 1 addu.ph t0, t1, t0 // t0 = A3*3|A2*3 addu t3, t3, t6 // t3 = this * 3 addu.ph t0, t0, t2 // t0 = next2|next1 addu t1, t3, t7 andi t7, t0, 0xFFFF // t7 = next1 sll t2, t7, 1 addu t2, t7, t2 // t2 = next1*3 addu t4, t2, t6 srl t6, t0, 16 // t6 = next2 shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4 addu t0, t3, t7 addiu t0, 7 srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4 shra_r.w t4, t4, 4 // t3 = (next1*3 + this + 8) >> 4 addu 
t2, t2, t6 addiu t2, 7 srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4 sb t1, 0(s3) sb t0, 1(s3) sb t4, 2(s3) sb t2, 3(s3) bne t8, s0, 2b addiu s3, 4 22: beqz s5, 4f addu t8, s0, s5 3: lbu t0, 0(s0) lbu t2, 0(s1) addiu s0, 1 addiu s1, 1 sll t3, t6, 1 sll t1, t0, 1 addu t1, t0, t1 // t1 = inptr0 * 3 addu t3, t3, t6 // t3 = thiscolsum * 3 addu t5, t1, t2 addu t1, t3, t7 shra_r.w t1, t1, 4 addu t0, t3, t5 addiu t0, 7 srl t0, t0, 4 sb t1, 0(s3) sb t0, 1(s3) addiu s3, 2 move t7, t6 bne t8, s0, 3b move t6, t5 4: sll t0, t6, 2 // t0 = thiscolsum * 4 subu t1, t0, t6 // t1 = thiscolsum * 3 addu t1, t1, t7 addiu s4, 4 shra_r.w t1, t1, 4 addiu t0, 7 srl t0, t0, 4 sb t1, 0(s3) sb t0, 1(s3) addiu t9, -1 addiu s3, 2 bnez t9, 1b lw s1, 4(a2) srl t0, s4, 2 subu t0, a0, t0 bgtz t0, 0b addiu a2, 4 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 j ra nop END(jsimd_h2v2_fancy_upsample_dspr2) /*****************************************************************************/ LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2) /* * a0 = cinfo->max_v_samp_factor * a1 = downsampled_width * a2 = input_data * a3 = output_data_ptr */ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 .set at beqz a0, 3f sll t0, a0, 2 lw s1, 0(a3) li s3, 0x10001 addu s0, s1, t0 0: addiu t8, a1, -2 srl t9, t8, 2 lw t7, 0(a2) lw s2, 0(s1) lbu t0, 0(t7) lbu t1, 1(t7) // t1 = inptr[1] sll t2, t0, 1 addu t2, t2, t0 // t2 = invalue*3 addu t2, t2, t1 shra_r.w t2, t2, 2 sb t0, 0(s2) sb t2, 1(s2) beqz t9, 11f addiu s2, 2 1: ulw t0, 0(t7) // t0 = |P3|P2|P1|P0| ulw t1, 1(t7) ulh t2, 4(t7) // t2 = |0|0|P5|P4| preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2| preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0| preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4| preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3| preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1| shll.ph t5, t4, 1 shll.ph t6, t1, 1 addu.ph t5, t5, t4 // t5 = |P4*3|P3*3| addu.ph t6, t6, t1 // t6 = |P2*3|P1*3| addu.ph t4, t3, s3 addu.ph t0, t0, s3 addu.ph t4, t4, t5 addu.ph t0, t0, t6 shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2| shrl.ph 
t0, t0, 2 // t0 = |0|P1|0|P0| addu.ph t2, t2, t5 addu.ph t3, t3, t6 shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4| shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2| shll.ph t2, t2, 8 shll.ph t3, t3, 8 or t2, t4, t2 or t3, t3, t0 addiu t9, -1 usw t3, 0(s2) usw t2, 4(s2) addiu s2, 8 bgtz t9, 1b addiu t7, 4 11: andi t8, 3 beqz t8, 22f addiu t7, 1 2: lbu t0, 0(t7) addiu t7, 1 sll t1, t0, 1 addu t2, t0, t1 // t2 = invalue lbu t3, -2(t7) lbu t4, 0(t7) addiu t3, 1 addiu t4, 2 addu t3, t3, t2 addu t4, t4, t2 srl t3, 2 srl t4, 2 sb t3, 0(s2) sb t4, 1(s2) addiu t8, -1 bgtz t8, 2b addiu s2, 2 22: lbu t0, 0(t7) lbu t2, -1(t7) sll t1, t0, 1 addu t1, t1, t0 // t1 = invalue * 3 addu t1, t1, t2 addiu t1, 1 srl t1, t1, 2 sb t1, 0(s2) sb t0, 1(s2) addiu s1, 4 bne s1, s0, 0b addiu a2, 4 3: RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 j ra nop END(jsimd_h2v1_fancy_upsample_dspr2) /*****************************************************************************/ LEAF_DSPR2(jsimd_h2v1_downsample_dspr2) /* * a0 = cinfo->image_width * a1 = cinfo->max_v_samp_factor * a2 = compptr->v_samp_factor * a3 = compptr->width_in_blocks * 16(sp) = input_data * 20(sp) = output_data */ .set at SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4 beqz a2, 7f lw s1, 44(sp) // s1 = output_data lw s0, 40(sp) // s0 = input_data srl s2, a0, 2 andi t9, a0, 2 srl t7, t9, 1 addu s2, t7, s2 sll t0, a3, 3 // t0 = width_in_blocks*DCT srl t7, t0, 1 subu s2, t7, s2 0: andi t6, a0, 1 // t6 = temp_index addiu t6, -1 lw t4, 0(s1) // t4 = outptr lw t5, 0(s0) // t5 = inptr0 li s3, 0 // s3 = bias srl t7, a0, 1 // t7 = image_width1 srl s4, t7, 2 andi t8, t7, 3 1: ulhu t0, 0(t5) ulhu t1, 2(t5) ulhu t2, 4(t5) ulhu t3, 6(t5) raddu.w.qb t0, t0 raddu.w.qb t1, t1 raddu.w.qb t2, t2 raddu.w.qb t3, t3 shra.ph t0, t0, 1 shra_r.ph t1, t1, 1 shra.ph t2, t2, 1 shra_r.ph t3, t3, 1 sb t0, 0(t4) sb t1, 1(t4) sb t2, 2(t4) sb t3, 3(t4) addiu s4, -1 addiu t4, 4 bgtz s4, 1b addiu t5, 8 beqz t8, 3f addu s4, t4, t8 2: ulhu t0, 0(t5) raddu.w.qb t0, t0 addqh.w t0, t0, s3 xori 
s3, s3, 1 sb t0, 0(t4) addiu t4, 1 bne t4, s4, 2b addiu t5, 2 3: lbux t1, t6(t5) sll t1, 1 addqh.w t2, t1, s3 // t2 = pixval1 xori s3, s3, 1 addqh.w t3, t1, s3 // t3 = pixval2 blez s2, 5f append t3, t2, 8 addu t5, t4, s2 // t5 = loop_end2 4: ush t3, 0(t4) addiu s2, -1 bgtz s2, 4b addiu t4, 2 5: beqz t9, 6f nop sb t2, 0(t4) 6: addiu s1, 4 addiu a2, -1 bnez a2, 0b addiu s0, 4 7: RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4 j ra nop END(jsimd_h2v1_downsample_dspr2) /*****************************************************************************/ LEAF_DSPR2(jsimd_h2v2_downsample_dspr2) /* * a0 = cinfo->image_width * a1 = cinfo->max_v_samp_factor * a2 = compptr->v_samp_factor * a3 = compptr->width_in_blocks * 16(sp) = input_data * 20(sp) = output_data */ .set at SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 beqz a2, 8f lw s1, 52(sp) // s1 = output_data lw s0, 48(sp) // s0 = input_data andi t6, a0, 1 // t6 = temp_index addiu t6, -1 srl t7, a0, 1 // t7 = image_width1 srl s4, t7, 2 andi t8, t7, 3 andi t9, a0, 2 srl s2, a0, 2 srl t7, t9, 1 addu s2, t7, s2 sll t0, a3, 3 // s2 = width_in_blocks*DCT srl t7, t0, 1 subu s2, t7, s2 0: lw t4, 0(s1) // t4 = outptr lw t5, 0(s0) // t5 = inptr0 lw s7, 4(s0) // s7 = inptr1 li s6, 1 // s6 = bias 2: ulw t0, 0(t5) // t0 = |P3|P2|P1|P0| ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0| ulw t2, 4(t5) ulw t3, 4(s7) precrq.ph.w t7, t0, t1 // t2 = |P3|P2|Q3|Q2| ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0| raddu.w.qb t1, t7 raddu.w.qb t0, t0 shra_r.w t1, t1, 2 addiu t0, 1 srl t0, 2 precrq.ph.w t7, t2, t3 ins t2, t3, 16, 16 raddu.w.qb t7, t7 raddu.w.qb t2, t2 shra_r.w t7, t7, 2 addiu t2, 1 srl t2, 2 sb t0, 0(t4) sb t1, 1(t4) sb t2, 2(t4) sb t7, 3(t4) addiu t4, 4 addiu t5, 8 addiu s4, s4, -1 bgtz s4, 2b addiu s7, 8 beqz t8, 4f addu t8, t4, t8 3: ulhu t0, 0(t5) ulhu t1, 0(s7) ins t0, t1, 16, 16 raddu.w.qb t0, t0 addu t0, t0, s6 srl t0, 2 xori s6, s6, 3 sb t0, 0(t4) addiu t5, 2 addiu t4, 1 bne t8, t4, 3b addiu s7, 2 4: lbux t1, t6(t5) sll t1, 1 lbux t0, 
t6(s7) sll t0, 1 addu t1, t1, t0 addu t3, t1, s6 srl t0, t3, 2 // t2 = pixval1 xori s6, s6, 3 addu t2, t1, s6 srl t1, t2, 2 // t3 = pixval2 blez s2, 6f append t1, t0, 8 5: ush t1, 0(t4) addiu s2, -1 bgtz s2, 5b addiu t4, 2 6: beqz t9, 7f nop sb t0, 0(t4) 7: addiu s1, 4 addiu a2, -1 bnez a2, 0b addiu s0, 8 8: RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 j ra nop END(jsimd_h2v2_downsample_dspr2) /*****************************************************************************/ LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2) /* * a0 = input_data * a1 = output_data * a2 = compptr->v_samp_factor * a3 = cinfo->max_v_samp_factor * 16(sp) = cinfo->smoothing_factor * 20(sp) = compptr->width_in_blocks * 24(sp) = cinfo->image_width */ .set at SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 lw s7, 52(sp) // compptr->width_in_blocks lw s0, 56(sp) // cinfo->image_width lw s6, 48(sp) // cinfo->smoothing_factor sll s7, 3 // output_cols = width_in_blocks * DCTSIZE sll v0, s7, 1 subu v0, v0, s0 blez v0, 2f move v1, zero addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2 0: addiu t1, a0, -4 sll t2, v1, 2 lwx t1, t2(t1) move t3, v0 addu t1, t1, s0 lbu t2, -1(t1) 1: addiu t3, t3, -1 sb t2, 0(t1) bgtz t3, 1b addiu t1, t1, 1 addiu v1, v1, 1 bne v1, t0, 0b nop 2: li v0, 80 mul v0, s6, v0 li v1, 16384 move t4, zero move t5, zero subu t6, v1, v0 // t6 = 16384 - tmp_smoot_f * 80 sll t7, s6, 4 // t7 = tmp_smoot_f * 16 3: /* Special case for first column: pretend column -1 is same as column 0 */ sll v0, t4, 2 lwx t8, v0(a1) // outptr = output_data[outrow] sll v1, t5, 2 addiu t9, v1, 4 addiu s0, v1, -4 addiu s1, v1, 8 lwx s2, v1(a0) // inptr0 = input_data[inrow] lwx t9, t9(a0) // inptr1 = input_data[inrow+1] lwx s0, s0(a0) // above_ptr = input_data[inrow-1] lwx s1, s1(a0) // below_ptr = input_data[inrow+2] lh v0, 0(s2) lh v1, 0(t9) lh t0, 0(s0) lh t1, 0(s1) ins v0, v1, 16, 16 ins t0, t1, 16, 16 raddu.w.qb t2, v0 raddu.w.qb s3, t0 lbu v0, 0(s2) lbu v1, 2(s2) lbu t0, 0(t9) lbu 
t1, 2(t9) addu v0, v0, v1 mult $ac1, t2, t6 addu t0, t0, t1 lbu t2, 2(s0) addu t0, t0, v0 lbu t3, 2(s1) addu s3, t0, s3 lbu v0, 0(s0) lbu t0, 0(s1) sll s3, s3, 1 addu v0, v0, t2 addu t0, t0, t3 addu t0, t0, v0 addu s3, t0, s3 madd $ac1, s3, t7 extr_r.w v0, $ac1, 16 addiu t8, t8, 1 addiu s2, s2, 2 addiu t9, t9, 2 addiu s0, s0, 2 addiu s1, s1, 2 sb v0, -1(t8) addiu s4, s7, -2 and s4, s4, 3 addu s5, s4, t8 // end address 4: lh v0, 0(s2) lh v1, 0(t9) lh t0, 0(s0) lh t1, 0(s1) ins v0, v1, 16, 16 ins t0, t1, 16, 16 raddu.w.qb t2, v0 raddu.w.qb s3, t0 lbu v0, -1(s2) lbu v1, 2(s2) lbu t0, -1(t9) lbu t1, 2(t9) addu v0, v0, v1 mult $ac1, t2, t6 addu t0, t0, t1 lbu t2, 2(s0) addu t0, t0, v0 lbu t3, 2(s1) addu s3, t0, s3 lbu v0, -1(s0) lbu t0, -1(s1) sll s3, s3, 1 addu v0, v0, t2 addu t0, t0, t3 addu t0, t0, v0 addu s3, t0, s3 madd $ac1, s3, t7 extr_r.w t2, $ac1, 16 addiu t8, t8, 1 addiu s2, s2, 2 addiu t9, t9, 2 addiu s0, s0, 2 sb t2, -1(t8) bne s5, t8, 4b addiu s1, s1, 2 addiu s5, s7, -2 subu s5, s5, s4 addu s5, s5, t8 // end address 5: lh v0, 0(s2) lh v1, 0(t9) lh t0, 0(s0) lh t1, 0(s1) ins v0, v1, 16, 16 ins t0, t1, 16, 16 raddu.w.qb t2, v0 raddu.w.qb s3, t0 lbu v0, -1(s2) lbu v1, 2(s2) lbu t0, -1(t9) lbu t1, 2(t9) addu v0, v0, v1 mult $ac1, t2, t6 addu t0, t0, t1 lbu t2, 2(s0) addu t0, t0, v0 lbu t3, 2(s1) addu s3, t0, s3 lbu v0, -1(s0) lbu t0, -1(s1) sll s3, s3, 1 addu v0, v0, t2 addu t0, t0, t3 lh v1, 2(t9) addu t0, t0, v0 lh v0, 2(s2) addu s3, t0, s3 lh t0, 2(s0) lh t1, 2(s1) madd $ac1, s3, t7 extr_r.w t2, $ac1, 16 ins t0, t1, 16, 16 ins v0, v1, 16, 16 raddu.w.qb s3, t0 lbu v1, 4(s2) lbu t0, 1(t9) lbu t1, 4(t9) sb t2, 0(t8) raddu.w.qb t3, v0 lbu v0, 1(s2) addu t0, t0, t1 mult $ac1, t3, t6 addu v0, v0, v1 lbu t2, 4(s0) addu t0, t0, v0 lbu v0, 1(s0) addu s3, t0, s3 lbu t0, 1(s1) lbu t3, 4(s1) addu v0, v0, t2 sll s3, s3, 1 addu t0, t0, t3 lh v1, 4(t9) addu t0, t0, v0 lh v0, 4(s2) addu s3, t0, s3 lh t0, 4(s0) lh t1, 4(s1) madd $ac1, s3, t7 extr_r.w t2, $ac1, 16 ins t0, t1, 
16, 16 ins v0, v1, 16, 16 raddu.w.qb s3, t0 lbu v1, 6(s2) lbu t0, 3(t9) lbu t1, 6(t9) sb t2, 1(t8) raddu.w.qb t3, v0 lbu v0, 3(s2) addu t0, t0, t1 mult $ac1, t3, t6 addu v0, v0, v1 lbu t2, 6(s0) addu t0, t0, v0 lbu v0, 3(s0) addu s3, t0, s3 lbu t0, 3(s1) lbu t3, 6(s1) addu v0, v0, t2 sll s3, s3, 1 addu t0, t0, t3 lh v1, 6(t9) addu t0, t0, v0 lh v0, 6(s2) addu s3, t0, s3 lh t0, 6(s0) lh t1, 6(s1) madd $ac1, s3, t7 extr_r.w t3, $ac1, 16 ins t0, t1, 16, 16 ins v0, v1, 16, 16 raddu.w.qb s3, t0 lbu v1, 8(s2) lbu t0, 5(t9) lbu t1, 8(t9) sb t3, 2(t8) raddu.w.qb t2, v0 lbu v0, 5(s2) addu t0, t0, t1 mult $ac1, t2, t6 addu v0, v0, v1 lbu t2, 8(s0) addu t0, t0, v0 lbu v0, 5(s0) addu s3, t0, s3 lbu t0, 5(s1) lbu t3, 8(s1) addu v0, v0, t2 sll s3, s3, 1 addu t0, t0, t3 addiu t8, t8, 4 addu t0, t0, v0 addiu s2, s2, 8 addu s3, t0, s3 addiu t9, t9, 8 madd $ac1, s3, t7 extr_r.w t1, $ac1, 16 addiu s0, s0, 8 addiu s1, s1, 8 bne s5, t8, 5b sb t1, -1(t8) /* Special case for last column */ lh v0, 0(s2) lh v1, 0(t9) lh t0, 0(s0) lh t1, 0(s1) ins v0, v1, 16, 16 ins t0, t1, 16, 16 raddu.w.qb t2, v0 raddu.w.qb s3, t0 lbu v0, -1(s2) lbu v1, 1(s2) lbu t0, -1(t9) lbu t1, 1(t9) addu v0, v0, v1 mult $ac1, t2, t6 addu t0, t0, t1 lbu t2, 1(s0) addu t0, t0, v0 lbu t3, 1(s1) addu s3, t0, s3 lbu v0, -1(s0) lbu t0, -1(s1) sll s3, s3, 1 addu v0, v0, t2 addu t0, t0, t3 addu t0, t0, v0 addu s3, t0, s3 madd $ac1, s3, t7 extr_r.w t0, $ac1, 16 addiu t5, t5, 2 sb t0, 0(t8) addiu t4, t4, 1 bne t4, a2, 3b addiu t5, t5, 2 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 j ra nop END(jsimd_h2v2_smooth_downsample_dspr2)

/*****************************************************************************/
LEAF_DSPR2(jsimd_int_upsample_dspr2)
/*
 * a0 = upsample->h_expand[compptr->component_index]
 * a1 = upsample->v_expand[compptr->component_index]
 * a2 = input_data
 * a3 = output_data_ptr
 * 16(sp) = cinfo->output_width
 * 20(sp) = cinfo->max_v_samp_factor
 *
 * Integer-factor upsampling.  For every input row, each input sample is
 * written h_expand times to output_data[outrow]; that expanded row is then
 * copied into the following (v_expand - 1) output rows.  The loop runs
 * while outrow < max_v_samp_factor.
 *
 * Row pointers are 4 bytes wide (they are fetched with lw), so row indices
 * are scaled by 4 before being added to the row-pointer arrays.
 *
 * Scratch registers: t0-t9, v0, v1.  s0-s3 are saved/restored by the
 * SAVE/RESTORE macros.  The instruction following each branch is written as
 * a branch delay slot (extra indent), as elsewhere in this file.
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    lw          s0, 0(a3)       // s0 = output_data
    lw          s1, 32(sp)      // s1 = cinfo->output_width
    lw          s2, 36(sp)      // s2 = cinfo->max_v_samp_factor
    li          t6, 0           // t6 = inrow
    beqz        s2, 10f
     li         s3, 0           // s3 = outrow
0:
    sll         t0, t6, 2       // t0 = inrow * 4 (byte offset of row pointer)
    sll         t7, s3, 2       // t7 = outrow * 4
    addu        t0, a2, t0      // t0 = &input_data[inrow]
    addu        t7, s0, t7      // t7 = &output_data[outrow]
    lw          t3, 0(t0)       // t3 = inptr
    lw          t8, 0(t7)       // t8 = outptr
    beqz        s1, 9f          // zero width: nothing to expand or duplicate
     addu       t5, t8, s1      // t5 = outend
1:
    lb          t2, 0(t3)       // t2 = invalue = *inptr++
    addiu       t3, 1
    beqz        a0, 3f
     move       t0, a0          // t0 = h_expand
2:
    sb          t2, 0(t8)
    addiu       t0, -1
    bgtz        t0, 2b
     addiu      t8, 1
3:
    bgt         t5, t8, 1b
     nop
4:
    /* Duplicate the expanded row into the next (v_expand - 1) rows */
    addiu       t9, a1, -1      // t9 = v_expand - 1
    blez        t9, 9f
     addiu      v0, t7, 4       // v0 = &output_data[outrow + 1]
    lw          v1, 0(t7)       // v1 = output_data[outrow] (copy source)
5:
    move        t3, v1          // t3 = source row
    lw          t4, 0(v0)       // t4 = destination row
    subu        t0, s1, 0xF
    blez        t0, 7f          // fewer than 16 bytes: byte loop only
     addu       t5, t3, s1      // t5 = end address
    andi        t7, s1, 0xF     // t7 = residual
    subu        t8, t5, t7
6:
    ulw         t0, 0(t3)
    ulw         t1, 4(t3)
    ulw         t2, 8(t3)
    usw         t0, 0(t4)
    ulw         t0, 12(t3)
    usw         t1, 4(t4)
    usw         t2, 8(t4)
    usw         t0, 12(t4)
    addiu       t3, 16
    bne         t3, t8, 6b
     addiu      t4, 16
    beqz        t7, 8f
     nop
7:
    lbu         t0, 0(t3)
    sb          t0, 0(t4)
    addiu       t3, 1
    bne         t3, t5, 7b
     addiu      t4, 1
8:
    addiu       t9, -1
    bgtz        t9, 5b
     addiu      v0, 4           // next destination row pointer slot
9:
    addu        s3, s3, a1      // outrow += v_expand
    slt         t0, s3, s2      // continue while outrow < max_v_samp_factor
    bnez        t0, 0b
     addiu      t6, 1           // inrow++
10:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j           ra
     nop
END(jsimd_int_upsample_dspr2)

/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
/*
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 *
 * 2:1 horizontal replication: every input byte is written twice.  One row
 * is produced per outer iteration (output_data and input_data both advance
 * by one 4-byte row pointer).  The main loop turns 8 input bytes into 16
 * output bytes; the residual (output_width & 0xf) is handled two output
 * bytes at a time, so an odd residual writes one byte past output_width.
 * NOTE(review): presumably the row buffers are padded for this -- confirm.
 */
    lw          t7, 0(a3)       // t7 = output_data
    andi        t8, a1, 0xf     // t8 = residual
    sll         t0, a0, 2
    blez        a0, 4f
     addu       t9, t7, t0      // t9 = output_data end address
0:
    lw          t5, 0(t7)       // t5 = outptr
    lw          t6, 0(a2)       // t6 = inptr
    addu        t3, t5, a1      // t3 = outptr + output_width (end address)
    subu        t3, t8          // t3 = end address - residual
    beq         t5, t3, 2f
     move       t4, t8
1:
    ulw         t0, 0(t6)       // t0 = |P3|P2|P1|P0|
    ulw         t2, 4(t6)       // t2 = |P7|P6|P5|P4|
    srl         t1, t0, 16      // t1 = |X|X|P3|P2|
    ins         t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
    ins         t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
    ins         t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
    ins         t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
    usw         t0, 0(t5)
    usw         t1, 4(t5)
    srl         t0, t2, 16      // t0 = |X|X|P7|P6|
    ins         t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
    ins         t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
    ins         t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
    ins         t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
    usw         t2, 8(t5)
    usw         t0, 12(t5)
    addiu       t5, 16
    bne         t5, t3, 1b
     addiu      t6, 8
    beqz        t8, 3f
     move       t4, t8
2:
    lbu         t1, 0(t6)
    sb          t1, 0(t5)
    sb          t1, 1(t5)
    addiu       t4, -2
    addiu       t6, 1
    bgtz        t4, 2b
     addiu      t5, 2
3:
    addiu       t7, 4
    bne         t9, t7, 0b
     addiu      a2, 4
4:
    j           ra
     nop
END(jsimd_h2v1_upsample_dspr2)

/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
/*
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 *
 * 2:1 horizontal and vertical replication.  Each outer iteration expands
 * one input row into output row [0] exactly as the h2v1 routine does, then
 * copies that row into output row [1].  Per iteration the output row
 * pointer array advances by two entries (8 bytes), the input array by one
 * (4 bytes), and a0 is decremented by 2.
 */
    lw          t7, 0(a3)       // t7 = output_data
    blez        a0, 7f
     andi       t9, a1, 0xf     // t9 = residual
0:
    lw          t6, 0(a2)       // t6 = inptr
    lw          t5, 0(t7)       // t5 = outptr
    addu        t8, t5, a1      // t8 = outptr end address
    subu        t8, t9          // t8 = end address - residual
    beq         t5, t8, 2f
     move       t4, t9
1:
    ulw         t0, 0(t6)
    srl         t1, t0, 16
    ins         t0, t0, 16, 16
    ins         t0, t0, 8, 16
    ins         t1, t1, 16, 16
    ins         t1, t1, 8, 16
    ulw         t2, 4(t6)
    usw         t0, 0(t5)
    usw         t1, 4(t5)
    srl         t3, t2, 16
    ins         t2, t2, 16, 16
    ins         t2, t2, 8, 16
    ins         t3, t3, 16, 16
    ins         t3, t3, 8, 16
    usw         t2, 8(t5)
    usw         t3, 12(t5)
    addiu       t5, 16
    bne         t5, t8, 1b
     addiu      t6, 8
    beqz        t9, 3f
     move       t4, t9
2:
    lbu         t0, 0(t6)
    sb          t0, 0(t5)
    sb          t0, 1(t5)
    addiu       t4, -2
    addiu       t6, 1
    bgtz        t4, 2b
     addiu      t5, 2
3:
    /* Copy the freshly expanded row [0] into row [1] */
    lw          t6, 0(t7)       // t6 = outptr[0]
    lw          t5, 4(t7)       // t5 = outptr[1]
    addu        t4, t6, a1      // t4 = new end address
    beq         a1, t9, 5f      // width < 16: byte loop only
     subu       t8, t4, t9
4:
    ulw         t0, 0(t6)
    ulw         t1, 4(t6)
    ulw         t2, 8(t6)
    usw         t0, 0(t5)
    ulw         t0, 12(t6)
    usw         t1, 4(t5)
    usw         t2, 8(t5)
    usw         t0, 12(t5)
    addiu       t6, 16
    bne         t6, t8, 4b
     addiu      t5, 16
    beqz        t9, 6f
     nop
5:
    lbu         t0, 0(t6)
    sb          t0, 0(t5)
    addiu       t6, 1
    bne         t6, t4, 5b
     addiu      t5, 1
6:
    addiu       t7, 8
    addiu       a0, -2
    bgtz        a0, 0b
     addiu      a2, 4
7:
    j           ra
     nop
END(jsimd_h2v2_upsample_dspr2)

/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_islow_dspr2)
/*
 * a0 = coef_block
 * a1 = compptr->dcttable
 * a2 = output
 * a3 = range_limit
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2,
s3, s4, s5, s6, s7

    /*
     * Body of jsimd_idct_islow_dspr2.
     *
     * A 256-byte workspace (8 x 8 32-bit words) is carved out of the stack
     * below the saved registers; v0 walks it.
     *
     * Pass 1 (label 1, 8 iterations counted down in v1): one column per
     * iteration.  a0/a1 advance by one 16-bit element, v0 by one word;
     * results go to the workspace at a row stride of 32 bytes.
     * Pass 2 (label 4, 8 iterations): one workspace row (32 bytes) per
     * iteration, producing 8 output bytes through the pointer loaded from
     * 0(a2); a2 then advances by one 4-byte row pointer.
     */
    addiu       sp, sp, -256
    move        v0, sp
    addiu       v1, zero, 8     // v1 = DCTSIZE = 8
1:
    lh          s4, 32(a0)      // s4 = inptr[16]
    lh          s5, 64(a0)      // s5 = inptr[32]
    lh          s6, 96(a0)      // s6 = inptr[48]
    lh          t1, 112(a0)     // t1 = inptr[56]
    lh          t7, 16(a0)      // t7 = inptr[8]
    lh          t5, 80(a0)      // t5 = inptr[40]
    lh          t3, 48(a0)      // t3 = inptr[24]
    or          s4, s4, t1
    or          s4, s4, t3
    or          s4, s4, t5
    or          s4, s4, t7
    or          s4, s4, s5
    or          s4, s4, s6
    bnez        s4, 2f          // any AC term non-zero: full column IDCT
     addiu      v1, v1, -1
    /* All AC terms of this column are zero: replicate the scaled DC term */
    lh          s5, 0(a1)       // quantptr[DCTSIZE*0]
    lh          s6, 0(a0)       // inptr[DCTSIZE*0]
    mul         s5, s5, s6      // DEQUANTIZE(inptr[0], quantptr[0])
    sll         s5, s5, 2       // presumably << PASS1_BITS -- TODO confirm
    sw          s5, 0(v0)
    sw          s5, 32(v0)
    sw          s5, 64(v0)
    sw          s5, 96(v0)
    sw          s5, 128(v0)
    sw          s5, 160(v0)
    sw          s5, 192(v0)
    b           3f
     sw         s5, 224(v0)
2:
    /* Odd part (rows 7, 3, 5, 1 are already in t1, t3, t5, t7) */
    lh          t0, 112(a1)
    lh          t2, 48(a1)
    lh          t4, 80(a1)
    lh          t6, 16(a1)
    mul         t0, t0, t1      // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7])
    mul         t1, t2, t3      // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3])
    mul         t2, t4, t5      // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5])
    mul         t3, t6, t7      // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1])
    lh          t4, 32(a1)
    lh          t5, 32(a0)
    lh          t6, 96(a1)
    lh          t7, 96(a0)
    addu        s0, t0, t1      // z3 = tmp0 + tmp2
    addu        s1, t1, t2      // z2 = tmp1 + tmp2
    addu        s2, t2, t3      // z4 = tmp1 + tmp3
    addu        s3, s0, s2      // z3 + z4
    addiu       t9, zero, 9633  // FIX_1_175875602
    mul         s3, s3, t9      // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
    addu        t8, t0, t3      // z1 = tmp0 + tmp3
    addiu       t9, zero, 2446  // FIX_0_298631336
    mul         t0, t0, t9      // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
    addiu       t9, zero, 16819 // FIX_2_053119869
    mul         t2, t2, t9      // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
    addiu       t9, zero, 25172 // FIX_3_072711026
    mul         t1, t1, t9      // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
    addiu       t9, zero, 12299 // FIX_1_501321110
    mul         t3, t3, t9      // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
    addiu       t9, zero, 16069 // FIX_1_961570560
    mul         s0, s0, t9      // -z3 = MULTIPLY(z3, FIX_1_961570560)
    addiu       t9, zero, 3196  // FIX_0_390180644
    mul         s2, s2, t9      // -z4 = MULTIPLY(z4, FIX_0_390180644)
    addiu       t9, zero, 7373  // FIX_0_899976223
    mul         t8, t8, t9      // -z1 = MULTIPLY(z1, FIX_0_899976223)
    addiu       t9, zero, 20995 // FIX_2_562915447
    mul         s1, s1, t9      // -z2 = MULTIPLY(z2, FIX_2_562915447)
    subu        s0, s3, s0      // z3 += z5
    addu        t0, t0, s0      // tmp0 += z3
    addu        t1, t1, s0      // tmp2 += z3
    subu        s2, s3, s2      // z4 += z5
    addu        t2, t2, s2      // tmp1 += z4
    addu        t3, t3, s2      // tmp3 += z4
    subu        t0, t0, t8      // tmp0 += z1
    subu        t1, t1, s1      // tmp2 += z2
    subu        t2, t2, s1      // tmp1 += z2
    subu        t3, t3, t8      // tmp3 += z1
    /* Even part */
    mul         s0, t4, t5      // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
    addiu       t9, zero, 6270  // FIX_0_765366865
    mul         s1, t6, t7      // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
    lh          t4, 0(a1)
    lh          t5, 0(a0)
    lh          t6, 64(a1)
    lh          t7, 64(a0)
    mul         s2, t9, s0      // MULTIPLY(z2, FIX_0_765366865)
    mul         t5, t4, t5      // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
    mul         t6, t6, t7      // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
    addiu       t9, zero, 4433  // FIX_0_541196100
    addu        s3, s0, s1      // z2 + z3
    mul         s3, s3, t9      // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
    addiu       t9, zero, 15137 // FIX_1_847759065
    mul         t8, s1, t9      // MULTIPLY(z3, FIX_1_847759065)
    addu        t4, t5, t6
    subu        t5, t5, t6
    sll         t4, t4, 13      // tmp0 = (z2 + z3) << CONST_BITS
    sll         t5, t5, 13      // tmp1 = (z2 - z3) << CONST_BITS
    addu        t7, s3, s2      // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
    subu        t6, s3, t8      // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
    addu        s0, t4, t7
    subu        s1, t4, t7
    addu        s2, t5, t6
    subu        s3, t5, t6
    /* Combine even and odd parts, round-shift right by 11, store column */
    addu        t4, s0, t3
    subu        s0, s0, t3
    addu        t3, s2, t1
    subu        s2, s2, t1
    addu        t1, s3, t2
    subu        s3, s3, t2
    addu        t2, s1, t0
    subu        s1, s1, t0
    shra_r.w    t4, t4, 11
    shra_r.w    t3, t3, 11
    shra_r.w    t1, t1, 11
    shra_r.w    t2, t2, 11
    shra_r.w    s1, s1, 11
    shra_r.w    s3, s3, 11
    shra_r.w    s2, s2, 11
    shra_r.w    s0, s0, 11
    sw          t4, 0(v0)
    sw          t3, 32(v0)
    sw          t1, 64(v0)
    sw          t2, 96(v0)
    sw          s1, 128(v0)
    sw          s3, 160(v0)
    sw          s2, 192(v0)
    sw          s0, 224(v0)
3:
    addiu       a1, a1, 2       // next quantization table column
    addiu       a0, a0, 2       // next coefficient column
    bgtz        v1, 1b
     addiu      v0, v0, 4       // next workspace column

    /* Pass 2: rows of the workspace */
    move        v0, sp
    addiu       v1, zero, 8
4:
    lw          t0, 8(v0)       // z2 = (JLONG)wsptr[2]
    lw          t1, 24(v0)      // z3 = (JLONG)wsptr[6]
    lw          t2, 0(v0)       // (JLONG)wsptr[0]
    lw          t3, 16(v0)      // (JLONG)wsptr[4]
    lw          s4, 4(v0)       // (JLONG)wsptr[1]
    lw          s5, 12(v0)      // (JLONG)wsptr[3]
    lw          s6, 20(v0)      // (JLONG)wsptr[5]
    lw          s7, 28(v0)      // (JLONG)wsptr[7]
    or          s4, s4, t0
    or          s4, s4, t1
    or          s4, s4, t3
    or          s4, s4, s7
    or          s4, s4, s5
    or          s4, s4, s6
    bnez        s4, 5f          // any of wsptr[1..7] non-zero: full row IDCT
     addiu      v1, v1, -1
    /* Row has only wsptr[0]: one table lookup replicated to all 8 bytes */
    shra_r.w    s5, t2, 5
    andi        s5, s5, 0x3ff   // 10-bit index into the table at a3
    lbux        s5, s5(a3)
    lw          s1, 0(a2)       // s1 = output row pointer
    replv.qb    s5, s5
    usw         s5, 0(s1)
    usw         s5, 4(s1)
    b           6f
     nop
5:
    /* Even part */
    addu        t4, t0, t1      // z2 + z3
    addiu       t8, zero, 4433  // FIX_0_541196100
    mul         t5, t4, t8      // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
    addiu       t8, zero, 15137 // FIX_1_847759065
    mul         t1, t1, t8      // MULTIPLY(z3, FIX_1_847759065)
    addiu       t8, zero, 6270  // FIX_0_765366865
    mul         t0, t0, t8      // MULTIPLY(z2, FIX_0_765366865)
    addu        t4, t2, t3      // (JLONG)wsptr[0] + (JLONG)wsptr[4]
    subu        t2, t2, t3      // (JLONG)wsptr[0] - (JLONG)wsptr[4]
    sll         t4, t4, 13      // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS
    sll         t2, t2, 13      // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS
    subu        t1, t5, t1      // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
    subu        t3, t2, t1      // tmp12 = tmp1 - tmp2
    addu        t2, t2, t1      // tmp11 = tmp1 + tmp2
    addu        t5, t5, t0      // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
    subu        t1, t4, t5      // tmp13 = tmp0 - tmp3
    addu        t0, t4, t5      // tmp10 = tmp0 + tmp3
    /* Odd part */
    lw          t4, 28(v0)      // tmp0 = (JLONG)wsptr[7]
    lw          t6, 12(v0)      // tmp2 = (JLONG)wsptr[3]
    lw          t5, 20(v0)      // tmp1 = (JLONG)wsptr[5]
    lw          t7, 4(v0)       // tmp3 = (JLONG)wsptr[1]
    addu        s0, t4, t6      // z3 = tmp0 + tmp2
    addiu       t8, zero, 9633  // FIX_1_175875602
    addu        s1, t5, t7      // z4 = tmp1 + tmp3
    addu        s2, s0, s1      // z3 + z4
    mul         s2, s2, t8      // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
    addu        s3, t4, t7      // z1 = tmp0 + tmp3
    addu        t9, t5, t6      // z2 = tmp1 + tmp2
    addiu       t8, zero, 16069 // FIX_1_961570560
    mul         s0, s0, t8      // -z3 = MULTIPLY(z3, FIX_1_961570560)
    addiu       t8, zero, 3196  // FIX_0_390180644
    mul         s1, s1, t8      // -z4 = MULTIPLY(z4, FIX_0_390180644)
    addiu       t8, zero, 2446  // FIX_0_298631336
    mul         t4, t4, t8      // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
    addiu       t8, zero, 7373  // FIX_0_899976223
    mul         s3, s3, t8      // -z1 = MULTIPLY(z1, FIX_0_899976223)
    addiu       t8, zero, 16819 // FIX_2_053119869
    mul         t5, t5, t8      // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
    addiu       t8, zero, 20995 // FIX_2_562915447
    mul         t9, t9, t8      // -z2 = MULTIPLY(z2, FIX_2_562915447)
    addiu       t8, zero, 25172 // FIX_3_072711026
    mul         t6, t6, t8      // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
    addiu       t8, zero, 12299 // FIX_1_501321110
    mul         t7, t7, t8      // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
    subu        s0, s2, s0      // z3 += z5
    subu        s1, s2, s1      // z4 += z5
    addu        t4, t4, s0
    subu        t4, t4, s3      // tmp0
    addu        t5, t5, s1
    subu        t5, t5, t9      // tmp1
    addu        t6, t6, s0
    subu        t6, t6, t9      // tmp2
    addu        t7, t7, s1
    subu        t7, t7, s3      // tmp3
    /* Combine, round-shift right by 18, look up through the a3 table */
    addu        s0, t0, t7
    subu        t0, t0, t7
    addu        t7, t2, t6
    subu        t2, t2, t6
    addu        t6, t3, t5
    subu        t3, t3, t5
    addu        t5, t1, t4
    subu        t1, t1, t4
    shra_r.w    s0, s0, 18
    shra_r.w    t7, t7, 18
    shra_r.w    t6, t6, 18
    shra_r.w    t5, t5, 18
    shra_r.w    t1, t1, 18
    shra_r.w    t3, t3, 18
    shra_r.w    t2, t2, 18
    shra_r.w    t0, t0, 18
    andi        s0, s0, 0x3ff
    andi        t7, t7, 0x3ff
    andi        t6, t6, 0x3ff
    andi        t5, t5, 0x3ff
    andi        t1, t1, 0x3ff
    andi        t3, t3, 0x3ff
    andi        t2, t2, 0x3ff
    andi        t0, t0, 0x3ff
    lw          s1, 0(a2)       // s1 = output row pointer
    lbux        s0, s0(a3)
    lbux        t7, t7(a3)
    lbux        t6, t6(a3)
    lbux        t5, t5(a3)
    lbux        t1, t1(a3)
    lbux        t3, t3(a3)
    lbux        t2, t2(a3)
    lbux        t0, t0(a3)
    sb          s0, 0(s1)
    sb          t7, 1(s1)
    sb          t6, 2(s1)
    sb          t5, 3(s1)
    sb          t1, 4(s1)
    sb          t3, 5(s1)
    sb          t2, 6(s1)
    sb          t0, 7(s1)
6:
    addiu       v0, v0, 32      // next workspace row
    bgtz        v1, 4b
     addiu      a2, a2, 4       // next output row pointer

    addiu       sp, sp, 256     // release the workspace
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_idct_islow_dspr2)

/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
/*
 * a0 = inptr
 * a1 = quantptr
 * a2 = wsptr
 * a3 = mips_idct_ifast_coefs
 *
 * Column pass of the fast IDCT.  Each iteration loads 32-bit words that
 * hold two adjacent 16-bit elements, so two columns are processed at once
 * with paired-halfword arithmetic; a0, a1 and a2 advance by 4 bytes and the
 * loop stops when a0 reaches its start + 16 (four iterations).
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu       t9, a0, 16      // end address
    or          AT, a3, zero    // AT = coefficient table (a3)
0:
    lw          s0, 0(a1)       // quantptr[DCTSIZE*0]
    lw          t0, 0(a0)       // inptr[DCTSIZE*0]
    lw          t1, 16(a0)      // inptr[DCTSIZE*1]
    muleq_s.w.phl v0, t0, s0    // tmp0 ...
lw t2, 32(a0) // inptr[DCTSIZE*2]
    /*
     * Remainder of the two-column loop body of jsimd_idct_ifast_cols_dspr2.
     * Every register holds a pair of 16-bit values (one per column).  Each
     * dequantisation is done as muleq_s.w.phl / muleq_s.w.phr followed by
     * "ins ..., 16, 16", which re-packs the upper halves of the two 32-bit
     * products into one paired-halfword register.
     */
    lw          t3, 48(a0)      // inptr[DCTSIZE*3]
    lw          t4, 64(a0)      // inptr[DCTSIZE*4]
    lw          t5, 80(a0)      // inptr[DCTSIZE*5]
    muleq_s.w.phr t0, t0, s0    // ... tmp0 ...
    lw          t6, 96(a0)      // inptr[DCTSIZE*6]
    lw          t7, 112(a0)     // inptr[DCTSIZE*7]
    or          s4, t1, t2
    or          s5, t3, t4
    bnez        s4, 1f
     ins        t0, v0, 16, 16  // ... tmp0
    bnez        s5, 1f
     or         s6, t5, t6
    or          s6, s6, t7
    bnez        s6, 1f
     sw         t0, 0(a2)       // wsptr[DCTSIZE*0]
    /* Rows 1..7 are zero in both columns: replicate the dequantised DC pair */
    sw          t0, 16(a2)      // wsptr[DCTSIZE*1]
    sw          t0, 32(a2)      // wsptr[DCTSIZE*2]
    sw          t0, 48(a2)      // wsptr[DCTSIZE*3]
    sw          t0, 64(a2)      // wsptr[DCTSIZE*4]
    sw          t0, 80(a2)      // wsptr[DCTSIZE*5]
    sw          t0, 96(a2)      // wsptr[DCTSIZE*6]
    sw          t0, 112(a2)     // wsptr[DCTSIZE*7]
    addiu       a0, a0, 4
    b           2f
     addiu      a1, a1, 4
1:
    lw          s1, 32(a1)      // quantptr[DCTSIZE*2]
    lw          s2, 64(a1)      // quantptr[DCTSIZE*4]
    muleq_s.w.phl v0, t2, s1    // tmp1 ...
    muleq_s.w.phr t2, t2, s1    // ... tmp1 ...
    lw          s0, 16(a1)      // quantptr[DCTSIZE*1]
    lw          s1, 48(a1)      // quantptr[DCTSIZE*3]
    lw          s3, 96(a1)      // quantptr[DCTSIZE*6]
    muleq_s.w.phl v1, t4, s2    // tmp2 ...
    muleq_s.w.phr t4, t4, s2    // ... tmp2 ...
    lw          s2, 80(a1)      // quantptr[DCTSIZE*5]
    lw          t8, 4(AT)       // FIX(1.414213562)
    ins         t2, v0, 16, 16  // ... tmp1
    muleq_s.w.phl v0, t6, s3    // tmp3 ...
    muleq_s.w.phr t6, t6, s3    // ... tmp3 ...
    ins         t4, v1, 16, 16  // ... tmp2
    addq.ph     s4, t0, t4      // tmp10
    subq.ph     s5, t0, t4      // tmp11
    ins         t6, v0, 16, 16  // ... tmp3
    subq.ph     s6, t2, t6      // tmp12 ...
    addq.ph     s7, t2, t6      // tmp13
    mulq_s.ph   s6, s6, t8      // ... tmp12 ...
    addq.ph     t0, s4, s7      // tmp0
    subq.ph     t6, s4, s7      // tmp3
    muleq_s.w.phl v0, t1, s0    // tmp4 ...
    muleq_s.w.phr t1, t1, s0    // ... tmp4 ...
    shll_s.ph   s6, s6, 1       // x2
    lw          s3, 112(a1)     // quantptr[DCTSIZE*7]
    subq.ph     s6, s6, s7      // ... tmp12
    muleq_s.w.phl v1, t7, s3    // tmp7 ...
    muleq_s.w.phr t7, t7, s3    // ... tmp7 ...
    ins         t1, v0, 16, 16  // ... tmp4
    addq.ph     t2, s5, s6      // tmp1
    subq.ph     t4, s5, s6      // tmp2
    muleq_s.w.phl v0, t5, s2    // tmp6 ...
    muleq_s.w.phr t5, t5, s2    // ... tmp6 ...
    ins         t7, v1, 16, 16  // ... tmp7
    addq.ph     s5, t1, t7      // z11
    subq.ph     s6, t1, t7      // z12
    muleq_s.w.phl v1, t3, s1    // tmp5 ...
    muleq_s.w.phr t3, t3, s1    // ... tmp5 ...
    ins         t5, v0, 16, 16  // ... tmp6
    ins         t3, v1, 16, 16  // ... tmp5
    addq.ph     s7, t5, t3      // z13
    subq.ph     v0, t5, t3      // z10
    addq.ph     t7, s5, s7      // tmp7
    subq.ph     s5, s5, s7      // tmp11 ...
    addq.ph     v1, v0, s6      // z5 ...
    mulq_s.ph   s5, s5, t8      // ... tmp11
    lw          t8, 8(AT)       // FIX(1.847759065)
    lw          s4, 0(AT)       // FIX(1.082392200)
    addq.ph     s0, t0, t7
    subq.ph     s1, t0, t7
    mulq_s.ph   v1, v1, t8      // ... z5
    shll_s.ph   s5, s5, 1       // x2
    lw          t8, 12(AT)      // FIX(-2.613125930)
    sw          s0, 0(a2)       // wsptr[DCTSIZE*0]
    shll_s.ph   v0, v0, 1       // x4
    mulq_s.ph   v0, v0, t8      // tmp12 ...
    mulq_s.ph   s4, s6, s4      // tmp10 ...
    shll_s.ph   v1, v1, 1       // x2
    addiu       a0, a0, 4
    addiu       a1, a1, 4
    sw          s1, 112(a2)     // wsptr[DCTSIZE*7]
    shll_s.ph   s6, v0, 1       // x4
    shll_s.ph   s4, s4, 1       // x2
    addq.ph     s6, s6, v1      // ... tmp12
    subq.ph     t5, s6, t7      // tmp6
    subq.ph     s4, s4, v1      // ... tmp10
    subq.ph     t3, s5, t5      // tmp5
    addq.ph     s2, t2, t5
    addq.ph     t1, s4, t3      // tmp4
    subq.ph     s3, t2, t5
    sw          s2, 16(a2)      // wsptr[DCTSIZE*1]
    sw          s3, 96(a2)      // wsptr[DCTSIZE*6]
    addq.ph     v0, t4, t3
    subq.ph     v1, t4, t3
    sw          v0, 32(a2)      // wsptr[DCTSIZE*2]
    sw          v1, 80(a2)      // wsptr[DCTSIZE*5]
    addq.ph     v0, t6, t1
    subq.ph     v1, t6, t1
    sw          v0, 64(a2)      // wsptr[DCTSIZE*4]
    sw          v1, 48(a2)      // wsptr[DCTSIZE*3]
2:
    bne         a0, t9, 0b
     addiu      a2, a2, 4       // next pair of workspace columns

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_idct_ifast_cols_dspr2)

/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
/*
 * a0 = wsptr
 * a1 = output_buf
 * a2 = output_col
 * a3 = mips_idct_ifast_coefs
 *
 * Row pass of the fast IDCT.  Two workspace rows (16 bytes each) are
 * processed per iteration; a0 advances by 32 bytes and a1 by two row
 * pointers until a0 reaches its start + 128 (four iterations).  a3 is
 * saved on the stack by the prologue and re-read into AT at the top of
 * every iteration because the loop body reuses a3 and AT as output
 * pointers.  s8 = 0x80808080 is added byte-wise to every output word.
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

    addiu       t9, a0, 128     // end address
    lui         s8, 0x8080
    ori         s8, s8, 0x8080
0:
    lw          AT, 36(sp)      // restore $a3 (mips_idct_ifast_coefs)
    lw          t0, 0(a0)       // wsptr[DCTSIZE*0+0/1] b a
    lw          s0, 16(a0)      // wsptr[DCTSIZE*1+0/1] B A
    lw          t2, 4(a0)       // wsptr[DCTSIZE*0+2/3] d c
    lw          s2, 20(a0)      // wsptr[DCTSIZE*1+2/3] D C
    lw          t4,
8(a0) // wsptr[DCTSIZE*0+4/5] f e
    /*
     * Remainder of the two-row loop body of jsimd_idct_ifast_rows_dspr2.
     * Lower-case letters a..h are the eight elements of the first row,
     * upper-case A..H those of the second row.  The precrq.ph.w / ins pairs
     * regroup the data so that each register holds the same column of both
     * rows (upper half = second row, lower half = first row).
     */
    lw          s4, 24(a0)      // wsptr[DCTSIZE*1+4/5] F E
    lw          t6, 12(a0)      // wsptr[DCTSIZE*0+6/7] h g
    lw          s6, 28(a0)      // wsptr[DCTSIZE*1+6/7] H G
    precrq.ph.w t1, s0, t0      // B b
    ins         t0, s0, 16, 16  // A a
    bnez        t1, 1f
     or         s0, t2, s2
    bnez        s0, 1f
     or         s0, t4, s4
    bnez        s0, 1f
     or         s0, t6, s6
    bnez        s0, 1f
     shll_s.ph  s0, t0, 2       // A a
    /* Elements 1..7 of both rows are zero: replicate element 0 of each row */
    lw          a3, 0(a1)       // first output row pointer
    lw          AT, 4(a1)       // second output row pointer
    precrq.ph.w t0, s0, s0      // A A
    ins         s0, s0, 16, 16  // a a
    addu        a3, a3, a2      // + output_col
    addu        AT, AT, a2
    precrq.qb.ph t0, t0, t0     // A A A A
    precrq.qb.ph s0, s0, s0     // a a a a
    addu.qb     s0, s0, s8      // add 0x80 to every byte
    addu.qb     t0, t0, s8
    sw          s0, 0(a3)
    sw          s0, 4(a3)
    sw          t0, 0(AT)
    sw          t0, 4(AT)
    addiu       a0, a0, 32
    bne         a0, t9, 0b
     addiu      a1, a1, 8
    b           2f
     nop
1:
    precrq.ph.w t3, s2, t2
    ins         t2, s2, 16, 16
    precrq.ph.w t5, s4, t4
    ins         t4, s4, 16, 16
    precrq.ph.w t7, s6, t6
    ins         t6, s6, 16, 16
    lw          t8, 4(AT)       // FIX(1.414213562)
    addq.ph     s4, t0, t4      // tmp10
    subq.ph     s5, t0, t4      // tmp11
    subq.ph     s6, t2, t6      // tmp12 ...
    addq.ph     s7, t2, t6      // tmp13
    mulq_s.ph   s6, s6, t8      // ... tmp12 ...
    addq.ph     t0, s4, s7      // tmp0
    subq.ph     t6, s4, s7      // tmp3
    shll_s.ph   s6, s6, 1       // x2
    subq.ph     s6, s6, s7      // ... tmp12
    addq.ph     t2, s5, s6      // tmp1
    subq.ph     t4, s5, s6      // tmp2
    addq.ph     s5, t1, t7      // z11
    subq.ph     s6, t1, t7      // z12
    addq.ph     s7, t5, t3      // z13
    subq.ph     v0, t5, t3      // z10
    addq.ph     t7, s5, s7      // tmp7
    subq.ph     s5, s5, s7      // tmp11 ...
    addq.ph     v1, v0, s6      // z5 ...
    mulq_s.ph   s5, s5, t8      // ... tmp11
    lw          t8, 8(AT)       // FIX(1.847759065)
    lw          s4, 0(AT)       // FIX(1.082392200)
    addq.ph     s0, t0, t7      // tmp0 + tmp7
    subq.ph     s7, t0, t7      // tmp0 - tmp7
    mulq_s.ph   v1, v1, t8      // ... z5
    lw          a3, 0(a1)       // first output row pointer
    lw          t8, 12(AT)      // FIX(-2.613125930)
    shll_s.ph   s5, s5, 1       // x2
    addu        a3, a3, a2      // + output_col
    shll_s.ph   v0, v0, 1       // x4
    mulq_s.ph   v0, v0, t8      // tmp12 ...
    mulq_s.ph   s4, s6, s4      // tmp10 ...
    shll_s.ph   v1, v1, 1       // x2
    addiu       a0, a0, 32
    addiu       a1, a1, 8
    shll_s.ph   s6, v0, 1       // x4
    shll_s.ph   s4, s4, 1       // x2
    addq.ph     s6, s6, v1      // ... tmp12
    shll_s.ph   s0, s0, 2
    subq.ph     t5, s6, t7      // tmp6
    subq.ph     s4, s4, v1      // ... tmp10
    subq.ph     t3, s5, t5      // tmp5
    shll_s.ph   s7, s7, 2
    addq.ph     t1, s4, t3      // tmp4
    addq.ph     s1, t2, t5      // tmp1 + tmp6
    subq.ph     s6, t2, t5      // tmp1 - tmp6
    addq.ph     s2, t4, t3      // tmp2 + tmp5
    subq.ph     s5, t4, t3      // tmp2 - tmp5
    addq.ph     s4, t6, t1      // tmp3 + tmp4
    subq.ph     s3, t6, t1      // tmp3 - tmp4
    shll_s.ph   s1, s1, 2
    shll_s.ph   s2, s2, 2
    shll_s.ph   s3, s3, 2
    shll_s.ph   s4, s4, 2
    shll_s.ph   s5, s5, 2
    shll_s.ph   s6, s6, 2
    precrq.ph.w t0, s1, s0      // B A
    ins         s0, s1, 16, 16  // b a
    precrq.ph.w t2, s3, s2      // D C
    ins         s2, s3, 16, 16  // d c
    precrq.ph.w t4, s5, s4      // F E
    ins         s4, s5, 16, 16  // f e
    precrq.ph.w t6, s7, s6      // H G
    ins         s6, s7, 16, 16  // h g
    precrq.qb.ph t0, t2, t0     // D C B A
    precrq.qb.ph s0, s2, s0     // d c b a
    precrq.qb.ph t4, t6, t4     // H G F E
    precrq.qb.ph s4, s6, s4     // h g f e
    addu.qb     s0, s0, s8
    addu.qb     s4, s4, s8
    sw          s0, 0(a3)       // outptr[0/1/2/3] d c b a
    sw          s4, 4(a3)       // outptr[4/5/6/7] h g f e
    lw          a3, -4(a1)      // second output row pointer (a1 already += 8)
    addu.qb     t0, t0, s8
    addu        a3, a3, a2
    addu.qb     t4, t4, s8
    sw          t0, 0(a3)       // outptr[0/1/2/3] D C B A
    bne         a0, t9, 0b
     sw         t4, 4(a3)       // outptr[4/5/6/7] H G F E
2:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

    j           ra
     nop
END(jsimd_idct_ifast_rows_dspr2)

/*****************************************************************************/
LEAF_DSPR2(jsimd_fdct_islow_dspr2)
/*
 * a0 = data
 *
 * In-place forward DCT on a block of 64 16-bit values.
 * Pass 1 (label 1): eight rows of 16 bytes, using packed pairs of 16-bit
 * constants in t0-t9 with dpa.w.ph dot products.
 * Pass 2 (label 2): eight columns (element stride 16 bytes), using scalar
 * constants reloaded into t0-t9 and a1.
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8

    /* Each lui/ori pair builds one (high << 16) | low constant pair */
    lui         t0, 6437
    ori         t0, 2260
    lui         t1, 9633
    ori         t1, 11363
    lui         t2, 0xd39e
    ori         t2, 0xe6dc
    lui         t3, 0xf72d
    ori         t3, 9633
    lui         t4, 2261
    ori         t4, 9633
    lui         t5, 0xd39e
    ori         t5, 6437
    lui         t6, 9633
    ori         t6, 0xd39d
    lui         t7, 0xe6dc
    ori         t7, 2260
    lui         t8, 4433
    ori         t8, 10703
    lui         t9, 0xd630
    ori         t9, 4433

    li          s8, 8           // s8 = row counter
    move        a1, a0          // a1 = row pointer
1:
    lw          s0, 0(a1)       // tmp0 = 1|0
    lw          s1, 4(a1)       // tmp1 = 3|2
    lw          s2, 8(a1)       // tmp2 = 5|4
    lw          s3, 12(a1)      // tmp3 = 7|6
    packrl.ph   s1, s1, s1      // tmp1 = 2|3
    packrl.ph   s3, s3, s3      // tmp3 = 6|7
    subq.ph     s7, s1, s2      // tmp7 = 2-5|3-4 = t5|t4
    subq.ph     s5, s0, s3      // tmp5 = 1-6|0-7 = t6|t7
    mult        $0, $0          // ac0 = 0
    dpa.w.ph    $ac0, s7, t0    // ac0 += t5* 6437 + t4* 2260
dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
    /*
     * Remainder of jsimd_fdct_islow_dspr2.  In pass 1 the four accumulators
     * produce the odd outputs of the row (stored at byte offsets 2, 6, 10,
     * 14); the even outputs follow (stored at 0, 4, 8, 12 relative to the
     * row start, addressed with negative offsets after a1 has advanced).
     */
    mult        $ac1, $0, $0    // ac1 = 0
    dpa.w.ph    $ac1, s7, t2    // ac1 += t5*-11362 + t4* -6436
    dpa.w.ph    $ac1, s5, t3    // ac1 += t6* -2259 + t7* 9633
    mult        $ac2, $0, $0    // ac2 = 0
    dpa.w.ph    $ac2, s7, t4    // ac2 += t5* 2261 + t4* 9633
    dpa.w.ph    $ac2, s5, t5    // ac2 += t6*-11362 + t7* 6437
    mult        $ac3, $0, $0    // ac3 = 0
    dpa.w.ph    $ac3, s7, t6    // ac3 += t5* 9633 + t4*-11363
    dpa.w.ph    $ac3, s5, t7    // ac3 += t6* -6436 + t7* 2260
    addq.ph     s6, s1, s2      // tmp6 = 2+5|3+4 = t2|t3
    addq.ph     s4, s0, s3      // tmp4 = 1+6|0+7 = t1|t0
    extr_r.w    s0, $ac0, 11    // tmp0 = (ac0 + 1024) >> 11
    extr_r.w    s1, $ac1, 11    // tmp1 = (ac1 + 1024) >> 11
    extr_r.w    s2, $ac2, 11    // tmp2 = (ac2 + 1024) >> 11
    extr_r.w    s3, $ac3, 11    // tmp3 = (ac3 + 1024) >> 11
    addq.ph     s5, s4, s6      // tmp5 = t1+t2|t0+t3 = t11|t10
    subq.ph     s7, s4, s6      // tmp7 = t1-t2|t0-t3 = t12|t13
    sh          s0, 2(a1)
    sh          s1, 6(a1)
    sh          s2, 10(a1)
    sh          s3, 14(a1)
    mult        $0, $0          // ac0 = 0
    dpa.w.ph    $ac0, s7, t8    // ac0 += t12* 4433 + t13* 10703
    mult        $ac1, $0, $0    // ac1 = 0
    dpa.w.ph    $ac1, s7, t9    // ac1 += t12*-10704 + t13* 4433
    sra         s4, s5, 16      // tmp4 = t11
    addiu       a1, a1, 16      // next row
    addiu       s8, s8, -1
    extr_r.w    s0, $ac0, 11    // tmp0 = (ac0 + 1024) >> 11
    extr_r.w    s1, $ac1, 11    // tmp1 = (ac1 + 1024) >> 11
    addu        s2, s5, s4      // tmp2 = t10 + t11 (only the low 16 bits are stored)
    subu        s3, s5, s4      // tmp3 = t10 - t11 (only the low 16 bits are stored)
    sll         s2, s2, 2       // tmp2 = (t10 + t11) << 2
    sll         s3, s3, 2       // tmp3 = (t10 - t11) << 2
    sh          s2, -16(a1)
    sh          s3, -8(a1)
    sh          s0, -12(a1)
    bgtz        s8, 1b
     sh         s1, -4(a1)

    /* Pass 2: columns.  Scalar constants (c0..c10 in the comments below) */
    li          t0, 2260
    li          t1, 11363
    li          t2, 9633
    li          t3, 6436
    li          t4, 6437
    li          t5, 2261
    li          t6, 11362
    li          t7, 2259
    li          t8, 4433
    li          t9, 10703
    li          a1, 10704
    li          s8, 8           // s8 = column counter
2:
    lh          a2, 0(a0)       // 0
    lh          a3, 16(a0)      // 8
    lh          v0, 32(a0)      // 16
    lh          v1, 48(a0)      // 24
    lh          s4, 64(a0)      // 32
    lh          s5, 80(a0)      // 40
    lh          s6, 96(a0)      // 48
    lh          s7, 112(a0)     // 56
    addu        s2, v0, s5      // tmp2 = 16 + 40
    subu        s5, v0, s5      // tmp5 = 16 - 40
    addu        s3, v1, s4      // tmp3 = 24 + 32
    subu        s4, v1, s4      // tmp4 = 24 - 32
    addu        s0, a2, s7      // tmp0 = 0 + 56
    subu        s7, a2, s7      // tmp7 = 0 - 56
    addu        s1, a3, s6      // tmp1 = 8 + 48
    subu        s6, a3, s6      // tmp6 = 8 - 48
    addu        a2, s0, s3      // tmp10 = tmp0 + tmp3
    subu        v1, s0, s3      // tmp13 = tmp0 - tmp3
    addu        a3, s1, s2      // tmp11 = tmp1 + tmp2
    subu        v0, s1, s2      // tmp12 = tmp1 - tmp2
    mult        s7, t1          // ac0 = tmp7 * c1
    madd        s4, t0          // ac0 += tmp4 * c0
    madd        s5, t4          // ac0 += tmp5 * c4
    madd        s6, t2          // ac0 += tmp6 * c2
    mult        $ac1, s7, t2    // ac1 = tmp7 * c2
    msub        $ac1, s4, t3    // ac1 -= tmp4 * c3
    msub        $ac1, s5, t6    // ac1 -= tmp5 * c6
    msub        $ac1, s6, t7    // ac1 -= tmp6 * c7
    mult        $ac2, s7, t4    // ac2 = tmp7 * c4
    madd        $ac2, s4, t2    // ac2 += tmp4 * c2
    madd        $ac2, s5, t5    // ac2 += tmp5 * c5
    msub        $ac2, s6, t6    // ac2 -= tmp6 * c6
    mult        $ac3, s7, t0    // ac3 = tmp7 * c0
    msub        $ac3, s4, t1    // ac3 -= tmp4 * c1
    madd        $ac3, s5, t2    // ac3 += tmp5 * c2
    msub        $ac3, s6, t3    // ac3 -= tmp6 * c3
    extr_r.w    s0, $ac0, 15    // tmp0 = (ac0 + 16384) >> 15
    extr_r.w    s1, $ac1, 15    // tmp1 = (ac1 + 16384) >> 15
    extr_r.w    s2, $ac2, 15    // tmp2 = (ac2 + 16384) >> 15
    extr_r.w    s3, $ac3, 15    // tmp3 = (ac3 + 16384) >> 15
    addiu       s8, s8, -1
    addu        s4, a2, a3      // tmp4 = tmp10 + tmp11
    subu        s5, a2, a3      // tmp5 = tmp10 - tmp11
    sh          s0, 16(a0)
    sh          s1, 48(a0)
    sh          s2, 80(a0)
    sh          s3, 112(a0)
    mult        v0, t8          // ac0 = tmp12 * c8
    madd        v1, t9          // ac0 += tmp13 * c9
    mult        $ac1, v1, t8    // ac1 = tmp13 * c8
    msub        $ac1, v0, a1    // ac1 -= tmp12 * c10
    addiu       a0, a0, 2       // next column
    extr_r.w    s6, $ac0, 15    // tmp6 = (ac0 + 16384) >> 15
    extr_r.w    s7, $ac1, 15    // tmp7 = (ac1 + 16384) >> 15
    shra_r.w    s4, s4, 2       // tmp4 = (tmp4 + 2) >> 2
    shra_r.w    s5, s5, 2       // tmp5 = (tmp5 + 2) >> 2
    sh          s4, -2(a0)
    sh          s5, 62(a0)
    sh          s6, 30(a0)
    bgtz        s8, 2b
     sh         s7, 94(a0)

    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8

    jr          ra
     nop
END(jsimd_fdct_islow_dspr2)

/**************************************************************************/
LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
/*
 * a0 = data
 *
 * In-place fast forward DCT on a block of 64 16-bit values: a row pass over
 * eight 16-byte rows followed by a column pass over eight columns.  The
 * multipliers are 8-bit fixed point (products are shifted right by 8); each
 * constant register holds the same 16-bit constant in both halves.
 */
    .set at

    SAVE_REGS_ON_STACK 8, s0, s1

    li          a1, 0x014e014e  // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
    li          a2, 0x008b008b  // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
    li          a3, 0x00620062  // FIX_0_382683433 (98 << 16)
|(98 & 0xffff)
    /*
     * Remainder of jsimd_fdct_ifast_dspr2.
     * Pass 1 (label 0): one 16-byte row per iteration until v0 reaches
     * data + 128.  Pass 2 (label 1): one column per iteration (element
     * stride 16 bytes) until v0 reaches data + 16.
     */
    li          s1, 0x00b500b5  // FIX_0_707106781 (181 << 16)|(181 & 0xffff)

    move        v0, a0
    addiu       v1, v0, 128     // end address
0:
    lw          t0, 0(v0)       // tmp0 = 1|0
    lw          t1, 4(v0)       // tmp1 = 3|2
    lw          t2, 8(v0)       // tmp2 = 5|4
    lw          t3, 12(v0)      // tmp3 = 7|6
    packrl.ph   t1, t1, t1      // tmp1 = 2|3
    packrl.ph   t3, t3, t3      // tmp3 = 6|7
    subq.ph     t7, t1, t2      // tmp7 = 2-5|3-4 = t5|t4
    subq.ph     t5, t0, t3      // tmp5 = 1-6|0-7 = t6|t7
    addq.ph     t6, t1, t2      // tmp6 = 2+5|3+4 = t2|t3
    addq.ph     t4, t0, t3      // tmp4 = 1+6|0+7 = t1|t0
    addq.ph     t8, t4, t6      // tmp5 = t1+t2|t0+t3 = t11|t10
    subq.ph     t9, t4, t6      // tmp7 = t1-t2|t0-t3 = t12|t13
    sra         t4, t8, 16      // tmp4 = t11
    mult        $0, $0          // ac0 = 0
    dpa.w.ph    $ac0, t9, s1    // ac0 += t12*181 + t13*181
    mult        $ac1, $0, $0    // ac1 = 0
    dpa.w.ph    $ac1, t7, a3    // ac1 += t4*98 + t5*98
    dpsx.w.ph   $ac1, t5, a3    // ac1 += t6*98 + t7*98
    mult        $ac2, $0, $0    // ac2 = 0
    dpa.w.ph    $ac2, t7, a2    // ac2 += t4*139 + t5*139
    mult        $ac3, $0, $0    // ac3 = 0
    dpa.w.ph    $ac3, t5, a1    // ac3 += t6*334 + t7*334
    precrq.ph.w t0, t5, t7      // t0 = t5|t6
    addq.ph     t2, t8, t4      // tmp2 = t10 + t11
    subq.ph     t3, t8, t4      // tmp3 = t10 - t11
    extr.w      t4, $ac0, 8
    mult        $0, $0          // ac0 = 0
    dpa.w.ph    $ac0, t0, s1    // ac0 += t5*181 + t6*181
    extr.w      t0, $ac1, 8     // t0 = z5
    extr.w      t1, $ac2, 8     // t1 = MULTIPLY(tmp10, 139)
    extr.w      t7, $ac3, 8     // t7 = MULTIPLY(tmp12, 334)
    extr.w      t8, $ac0, 8     // t8 = z3 = MULTIPLY(tmp11, 181)
    add         t6, t1, t0      // t6 = z2
    add         t7, t7, t0      // t7 = z4
    subq.ph     t0, t5, t8      // t0 = z13 = tmp7 - z3
    addq.ph     t8, t5, t8      // t8 = z11 = tmp7 + z3
    addq.ph     t1, t0, t6      // t1 = z13 + z2
    subq.ph     t6, t0, t6      // t6 = z13 - z2
    addq.ph     t0, t8, t7      // t0 = z11 + z4
    subq.ph     t7, t8, t7      // t7 = z11 - z4
    addq.ph     t5, t4, t9
    subq.ph     t4, t9, t4
    sh          t2, 0(v0)
    sh          t5, 4(v0)
    sh          t3, 8(v0)
    sh          t4, 12(v0)
    sh          t1, 10(v0)
    sh          t6, 6(v0)
    sh          t0, 2(v0)
    sh          t7, 14(v0)
    addiu       v0, 16
    bne         v1, v0, 0b
     nop

    move        v0, a0
    addiu       v1, v0, 16      // end address for the column pass
1:
    lh          t0, 0(v0)       // 0
    lh          t1, 16(v0)      // 8
    lh          t2, 32(v0)      // 16
    lh          t3, 48(v0)      // 24
    lh          t4, 64(v0)      // 32
    lh          t5, 80(v0)      // 40
    lh          t6, 96(v0)      // 48
    lh          t7, 112(v0)     // 56
    add         t8, t0, t7      // t8 = tmp0
    sub         t7, t0, t7      // t7 = tmp7
    add         t0, t1, t6      // t0 = tmp1
    sub         t1, t1, t6      // t1 = tmp6
    add         t6, t2, t5      // t6 = tmp2
    sub         t5, t2, t5      // t5 = tmp5
    add         t2, t3, t4      // t2 = tmp3
    sub         t3, t3, t4      // t3 = tmp4
    add         t4, t8, t2      // t4 = tmp10 = tmp0 + tmp3
    sub         t8, t8, t2      // t8 = tmp13 = tmp0 - tmp3
    sub         s0, t0, t6      // s0 = tmp12 = tmp1 - tmp2
    ins         t8, s0, 16, 16  // t8 = tmp12|tmp13
    add         t2, t0, t6      // t2 = tmp11 = tmp1 + tmp2
    mult        $0, $0          // ac0 = 0
    dpa.w.ph    $ac0, t8, s1    // ac0 += t12*181 + t13*181
    add         s0, t4, t2      // s0 = tmp10+tmp11
    sub         t4, t4, t2      // t4 = tmp10-tmp11
    sh          s0, 0(v0)
    sh          t4, 64(v0)
    extr.w      t2, $ac0, 8     // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781)
    addq.ph     t4, t8, t2      // t4 = tmp13 + z1
    subq.ph     t8, t8, t2      // t8 = tmp13 - z1
    sh          t4, 32(v0)
    sh          t8, 96(v0)
    add         t3, t3, t5      // t3 = tmp10 = tmp4 + tmp5
    add         t0, t5, t1      // t0 = tmp11 = tmp5 + tmp6
    add         t1, t1, t7      // t1 = tmp12 = tmp6 + tmp7
    andi        t4, a1, 0xffff  // scalar copy of the packed constant in a1
    mul         s0, t1, t4
    sra         s0, s0, 8       // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
    ins         t1, t3, 16, 16  // t1 = tmp10|tmp12
    mult        $0, $0          // ac0 = 0
    mulsa.w.ph  $ac0, t1, a3    // ac0 += t10*98 - t12*98
    extr.w      t8, $ac0, 8     // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433)
    add         t2, t7, t8      // t2 = tmp7 + z5
    sub         t7, t7, t8      // t7 = tmp7 - z5
    andi        t4, a2, 0xffff
    mul         t8, t3, t4
    sra         t8, t8, 8       // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
    andi        t4, s1, 0xffff
    mul         t6, t0, t4
    sra         t6, t6, 8       // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
    add         t0, t6, t8      // t0 = z3 + z2
    sub         t1, t6, t8      // t1 = z3 - z2
    add         t3, t6, s0      // t3 = z3 + z4
    sub         t4, t6, s0      // t4 = z3 - z4
    sub         t5, t2, t1      // t5 = dataptr[5]
    sub         t6, t7, t0      // t6 = dataptr[3]
    add         t3, t2, t3      // t3 = dataptr[1]
    add         t4, t7, t4      // t4 = dataptr[7]
    sh          t5, 80(v0)
    sh          t6, 48(v0)
    sh          t3, 16(v0)
    sh          t4, 112(v0)
    addiu       v0, 2
    bne         v0, v1, 1b
     nop

    RESTORE_REGS_FROM_STACK 8, s0, s1

    j           ra
     nop
END(jsimd_fdct_ifast_dspr2)

/*****************************************************************************/
LEAF_DSPR2(jsimd_quantize_dspr2)
/*
 * a0 = coef_block
 * a1 = divisors
 * a2 = workspace
 */
    .set
at

    /*
     * Body of jsimd_quantize_dspr2.
     *
     * For each of the 64 16-bit workspace values x, with three halfword
     * tables read from divisors at byte offsets +0, +128 and +384
     * (presumably reciprocal, correction and shift -- TODO confirm against
     * the code that fills the table):
     *
     *   sign   = ((x >> 15) << 1) + 1                  // -1 or +1
     *   mag    = x * sign                              // |x|
     *   result = ((((mag + tab128) & 0xffff) * (tab0 & 0xffff))
     *               >> (tab384 + 16)) * sign
     *
     * Two values are handled per iteration: the loop runs until a2 reaches
     * workspace + 124 and the last pair is finished after the loop.  The
     * operands for the next pair are loaded while the current pair is
     * being completed, so the next first element's sign (t3) and magnitude
     * (t0) are computed exactly once per iteration.
     */
    SAVE_REGS_ON_STACK 16, s0, s1, s2

    addiu       v0, a2, 124     // v0 = workspace_end
    lh          t0, 0(a2)       // x0
    lh          t1, 0(a1)       // tab0[0]
    lh          t2, 128(a1)     // tab128[0]
    sra         t3, t0, 15
    sll         t3, t3, 1
    addiu       t3, t3, 1       // t3 = sign(x0)
    mul         t0, t0, t3      // t0 = |x0|
    lh          t4, 384(a1)     // tab384[0]
    lh          t5, 130(a1)     // tab128[1]
    lh          t6, 2(a2)       // x1
    lh          t7, 2(a1)       // tab0[1]
    lh          t8, 386(a1)     // tab384[1]
1:
    andi        t1, 0xffff
    add         t9, t0, t2
    andi        t9, 0xffff
    mul         v1, t9, t1
    sra         s0, t6, 15
    sll         s0, s0, 1
    addiu       s0, s0, 1       // s0 = sign(x1)
    addiu       t9, t4, 16
    srav        v1, v1, t9
    mul         v1, v1, t3      // restore the sign of the first value
    mul         t6, t6, s0      // t6 = |x1|
    andi        t7, 0xffff
    addiu       a2, a2, 4
    addiu       a1, a1, 4
    add         s1, t6, t5
    andi        s1, 0xffff
    sh          v1, 0(a0)
    mul         s2, s1, t7
    addiu       s1, t8, 16
    srav        s2, s2, s1
    mul         s2, s2, s0      // restore the sign of the second value
    /* Load the operands of the next pair */
    lh          t0, 0(a2)
    lh          t1, 0(a1)
    sra         t3, t0, 15
    sll         t3, t3, 1
    addiu       t3, t3, 1
    mul         t0, t0, t3
    lh          t2, 128(a1)
    lh          t4, 384(a1)
    lh          t5, 130(a1)
    lh          t8, 386(a1)
    lh          t6, 2(a2)
    lh          t7, 2(a1)
    sh          s2, 2(a0)
    bne         a2, v0, 1b
     addiu      a0, a0, 4

    /* Last pair (operands were loaded by the final loop iteration) */
    andi        t1, 0xffff
    add         t9, t0, t2
    andi        t9, 0xffff
    mul         v1, t9, t1
    sra         s0, t6, 15
    sll         s0, s0, 1
    addiu       s0, s0, 1
    addiu       t9, t4, 16
    srav        v1, v1, t9
    mul         v1, v1, t3
    mul         t6, t6, s0
    andi        t7, 0xffff
    sh          v1, 0(a0)
    add         s1, t6, t5
    andi        s1, 0xffff
    mul         s2, s1, t7
    addiu       s1, t8, 16
    srav        s2, s2, s1
    mul         s2, s2, s0
    sh          s2, 2(a0)

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2

    j           ra
     nop
END(jsimd_quantize_dspr2)

#ifndef __mips_soft_float

/*****************************************************************************/
LEAF_DSPR2(jsimd_quantize_float_dspr2)
/*
 * a0 = coef_block
 * a1 = divisors
 * a2 = workspace
 *
 * Floating-point quantisation: each output is
 *   (short)(trunc(workspace[i] * divisors[i] + 16384.5) - 16384)
 * i.e. the product rounded to nearest.  Eight single-precision values are
 * processed per iteration (two software-pipelined groups of four); t0
 * counts 63, 55, ... 7, giving eight iterations.
 */
    .set at

    li          t1, 0x46800100  // integer representation 16384.5
    mtc1        t1, f0
    li          t0, 63
0:
    lwc1        f2, 0(a2)
    lwc1        f10, 0(a1)
    lwc1        f4, 4(a2)
    lwc1        f12, 4(a1)
    lwc1        f6, 8(a2)
    lwc1        f14, 8(a1)
    lwc1        f8, 12(a2)
    lwc1        f16, 12(a1)
    madd.s      f2, f0, f2, f10 // f2 = f2 * f10 + 16384.5
    madd.s      f4, f0, f4, f12
    madd.s      f6, f0, f6, f14
    madd.s      f8, f0, f8, f16
    lwc1        f10, 16(a1)
    lwc1        f12, 20(a1)
    trunc.w.s   f2, f2
    trunc.w.s   f4, f4
    trunc.w.s   f6, f6
    trunc.w.s   f8, f8
    lwc1        f14, 24(a1)
    lwc1        f16, 28(a1)
    mfc1        t1, f2
    mfc1        t2, f4
    mfc1        t3, f6
    mfc1        t4, f8
    lwc1        f2, 16(a2)
    lwc1        f4, 20(a2)
    lwc1        f6, 24(a2)
    lwc1        f8, 28(a2)
    madd.s      f2, f0, f2, f10
    madd.s      f4, f0, f4, f12
    madd.s      f6, f0, f6, f14
    madd.s      f8, f0, f8, f16
    addiu       t1, t1, -16384  // remove the 16384 bias
    addiu       t2, t2, -16384
    addiu       t3, t3, -16384
    addiu       t4, t4, -16384
    trunc.w.s   f2, f2
    trunc.w.s   f4, f4
    trunc.w.s   f6, f6
    trunc.w.s   f8, f8
    sh          t1, 0(a0)
    sh          t2, 2(a0)
    sh          t3, 4(a0)
    sh          t4, 6(a0)
    mfc1        t1, f2
    mfc1        t2, f4
    mfc1        t3, f6
    mfc1        t4, f8
    addiu       t0, t0, -8
    addiu       a2, a2, 32
    addiu       a1, a1, 32
    addiu       t1, t1, -16384
    addiu       t2, t2, -16384
    addiu       t3, t3, -16384
    addiu       t4, t4, -16384
    sh          t1, 8(a0)
    sh          t2, 10(a0)
    sh          t3, 12(a0)
    sh          t4, 14(a0)
    bgez        t0, 0b
     addiu      a0, a0, 16

    j           ra
     nop
END(jsimd_quantize_float_dspr2)

#endif

/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_2x2_dspr2)
/*
 * a0 = compptr->dct_table
 * a1 = coef_block
 * a2 = output_buf
 * a3 = output_col
 *
 * Reduced-size inverse DCT producing a 2x2 output block.  A 40-byte
 * workspace (ten 32-bit words) is reserved on the stack: words 0..4 hold
 * the first-row results for columns 0, 1, 3, 5, 7 and words 5..9 the
 * second-row results for the same columns.
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    addiu       sp, sp, -40

    move        v0, sp          // v0 = workspace
    addiu       s2, zero, 29692
    addiu       s3, zero, -10426
    addiu       s4, zero, 6967
    addiu       s5, zero, -5906
    /* Column 0 */
    lh          t0, 0(a1)       // t0 = inptr[DCTSIZE*0]
    lh          t5, 0(a0)       // t5 = quantptr[DCTSIZE*0]
    lh          t1, 48(a1)      // t1 = inptr[DCTSIZE*3]
    lh          t6, 48(a0)      // t6 = quantptr[DCTSIZE*3]
    mul         t4, t5, t0
    lh          t0, 16(a1)      // t0 = inptr[DCTSIZE*1]
    lh          t5, 16(a0)      // t5 = quantptr[DCTSIZE*1]
    mul         t6, t6, t1
    mul         t5, t5, t0
    lh          t2, 80(a1)      // t2 = inptr[DCTSIZE*5]
    lh          t7, 80(a0)      // t7 = quantptr[DCTSIZE*5]
    lh          t3, 112(a1)     // t3 = inptr[DCTSIZE*7]
    lh          t8, 112(a0)     // t8 = quantptr[DCTSIZE*7]
    mul         t7, t7, t2
    mult        zero, zero      // ac0 = 0
    // NOTE(review): the MIPS32 manual leaves HI/LO unpredictable after mul;
    // this sequence relies on the following mul not disturbing ac0 -- confirm
    // for the targeted cores.
    mul         t8, t8, t3
    li          s0, 0x73FCD746  // s0 = (29692 << 16) | (-10426 & 0xffff)
    li          s1, 0x1B37E8EE  // s1 = (6967 << 16) | (-5906 & 0xffff)
    ins         t6, t5, 16, 16  // t6 = t5|t6
    sll         t4, t4, 15
    dpa.w.ph    $ac0, t6, s0
    lh          t1, 2(a1)
    lh          t6, 2(a0)
    ins         t8, t7, 16, 16  // t8 = t7|t8
    dpa.w.ph    $ac0, t8, s1
    mflo        t0, $ac0
    /* Column 1 loads overlap the end of column 0 */
    mul         t5, t6, t1
    lh          t1, 18(a1)
    lh          t6, 18(a0)
    lh          t2, 50(a1)
    lh          t7, 50(a0)
    mul         t6, t6, t1
    subu        t8, t4, t0
    mul         t7, t7, t2
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    lh          t1, 82(a1)
    lh          t2, 82(a0)
    lh          t3, 114(a1)
    lh          t4,
114(a0)
    /*
     * Remainder of jsimd_idct_2x2_dspr2.
     *
     * Column pass, repeated for the coefficient columns at byte offsets
     * 2, 6, 10 and 14 (columns 1, 3, 5, 7): t4 = dequantised row 0 << 15,
     * ac0 = packed dot product of the dequantised rows 1/3 (with s0) and
     * rows 5/7 (with s1); the workspace receives (t4 + ac0) and (t4 - ac0),
     * each round-shifted right by 13.  The loads and multiplies of the next
     * column are interleaved with the tail of the current one.
     */
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2
    mul         t3, t3, t4
    sw          t0, 0(v0)       // column 0, first row
    sw          t8, 20(v0)      // column 0, second row
    sll         t4, t5, 15
    ins         t7, t6, 16, 16
    mult        zero, zero      // ac0 = 0
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    lh          t1, 6(a1)
    lh          t6, 6(a0)
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0
    mul         t5, t6, t1
    lh          t1, 22(a1)
    lh          t6, 22(a0)
    lh          t2, 54(a1)
    lh          t7, 54(a0)
    mul         t6, t6, t1
    subu        t8, t4, t0
    mul         t7, t7, t2
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    lh          t1, 86(a1)
    lh          t2, 86(a0)
    lh          t3, 118(a1)
    lh          t4, 118(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2
    mul         t3, t3, t4
    sw          t0, 4(v0)       // column 1, first row
    sw          t8, 24(v0)      // column 1, second row
    sll         t4, t5, 15
    ins         t7, t6, 16, 16
    mult        zero, zero      // ac0 = 0
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    lh          t1, 10(a1)
    lh          t6, 10(a0)
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0
    mul         t5, t6, t1
    lh          t1, 26(a1)
    lh          t6, 26(a0)
    lh          t2, 58(a1)
    lh          t7, 58(a0)
    mul         t6, t6, t1
    subu        t8, t4, t0
    mul         t7, t7, t2
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    lh          t1, 90(a1)
    lh          t2, 90(a0)
    lh          t3, 122(a1)
    lh          t4, 122(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2
    mul         t3, t3, t4
    sw          t0, 8(v0)       // column 3, first row
    sw          t8, 28(v0)      // column 3, second row
    sll         t4, t5, 15
    ins         t7, t6, 16, 16
    mult        zero, zero      // ac0 = 0
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    lh          t1, 14(a1)
    lh          t6, 14(a0)
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0
    mul         t5, t6, t1
    lh          t1, 30(a1)
    lh          t6, 30(a0)
    lh          t2, 62(a1)
    lh          t7, 62(a0)
    mul         t6, t6, t1
    subu        t8, t4, t0
    mul         t7, t7, t2
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    lh          t1, 94(a1)
    lh          t2, 94(a0)
    lh          t3, 126(a1)
    lh          t4, 126(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2
    mul         t3, t3, t4
    sw          t0, 12(v0)      // column 5, first row
    sw          t8, 32(v0)      // column 5, second row
    sll         t4, t5, 15
    ins         t7, t6, 16, 16
    mult        zero, zero      // ac0 = 0
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0
    /*
     * Row pass.  For each of the two workspace rows: tmp10 = first word
     * << 15, tmp0 = the other four words multiplied by s2..s5 and summed;
     * the two outputs are (tmp10 + tmp0) and (tmp10 - tmp0), each
     * round-shifted right by 20, saturated to a signed byte (saturating
     * shift left by 24 followed by arithmetic shift right by 24) and
     * biased by +128.
     */
    lw          t9, 0(a2)       // t9 = output_buf[0]
    lw          t3, 0(v0)
    lw          t7, 4(v0)
    lw          t1, 8(v0)
    addu        t9, t9, a3      // + output_col
    sll         t3, t3, 15
    subu        t8, t4, t0
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    shra_r.w    t8, t8, 13
    sw          t0, 16(v0)      // column 7, first row
    sw          t8, 36(v0)      // column 7, second row
    lw          t5, 12(v0)
    lw          t6, 16(v0)
    mult        t7, s2
    madd        t1, s3
    madd        t5, s4
    madd        t6, s5
    lw          t5, 24(v0)
    lw          t7, 28(v0)
    mflo        t0, $ac0
    lw          t8, 32(v0)
    lw          t2, 36(v0)
    mult        $ac1, t5, s2
    madd        $ac1, t7, s3
    madd        $ac1, t8, s4
    madd        $ac1, t2, s5
    addu        t1, t3, t0
    subu        t6, t3, t0
    shra_r.w    t1, t1, 20
    shra_r.w    t6, t6, 20
    mflo        t4, $ac1
    shll_s.w    t1, t1, 24
    shll_s.w    t6, t6, 24
    sra         t1, t1, 24
    sra         t6, t6, 24
    addiu       t1, t1, 128
    addiu       t6, t6, 128
    lw          t0, 20(v0)
    sb          t1, 0(t9)
    sb          t6, 1(t9)
    sll         t0, t0, 15
    lw          t9, 4(a2)       // t9 = output_buf[1]
    addu        t1, t0, t4
    subu        t6, t0, t4
    addu        t9, t9, a3      // + output_col
    shra_r.w    t1, t1, 20
    shra_r.w    t6, t6, 20
    shll_s.w    t1, t1, 24
    shll_s.w    t6, t6, 24
    sra         t1, t1, 24
    sra         t6, t6, 24
    addiu       t1, t1, 128
    addiu       t6, t6, 128
    sb          t1, 0(t9)
    sb          t6, 1(t9)
    addiu       sp, sp, 40      // release the workspace

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j           ra
     nop
END(jsimd_idct_2x2_dspr2)

/*****************************************************************************/ LEAF_DSPR2(jsimd_idct_4x4_dspr2) /* * a0 = compptr->dct_table * a1 = coef_block * a2 = output_buf * a3 = output_col * 16(sp) = workspace[DCTSIZE*4]; // buffers data between passes */ .set at SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 lw v1, 48(sp) move t0, a1 move t1, v1 li t9, 4 li s0, 0x2e75f93e li s1, 0x21f9ba79 li s2, 0xecc2efb0 li s3, 0x52031ccd 0: lh s6, 32(t0) // inptr[DCTSIZE*2] lh t6, 32(a0) // quantptr[DCTSIZE*2] lh s7, 96(t0) // inptr[DCTSIZE*6] lh t7, 96(a0) // quantptr[DCTSIZE*6] mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) lh s4, 0(t0) // inptr[DCTSIZE*0] mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) lh s5, 0(a0) // quantptr[0] li s6, 15137 li s7, 6270 mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) lh t5, 112(t0) // inptr[DCTSIZE*7] mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) lh s4, 112(a0) // quantptr[DCTSIZE*7] lh v0, 80(t0) // inptr[DCTSIZE*5] lh s5, 80(a0) // quantptr[DCTSIZE*5] lh s6, 48(a0) // quantptr[DCTSIZE*3] sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) lh s7, 16(a0) // quantptr[DCTSIZE*1] lh t8, 16(t0) // inptr[DCTSIZE*1] subu t6, t6, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) lh t7, 48(t0) // inptr[DCTSIZE*3] mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) mul v0, s5, v0 // z2 =
(inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) addu t3, t2, t6 // tmp10 = tmp0 + z2 subu t4, t2, t6 // tmp10 = tmp0 - z2 mult $ac0, zero, zero mult $ac1, zero, zero ins t5, v0, 16, 16 ins t7, t8, 16, 16 addiu t9, t9, -1 dpa.w.ph $ac0, t5, s0 dpa.w.ph $ac0, t7, s1 dpa.w.ph $ac1, t5, s2 dpa.w.ph $ac1, t7, s3 mflo s4, $ac0 mflo s5, $ac1 addiu a0, a0, 2 addiu t1, t1, 4 addiu t0, t0, 2 addu t6, t4, s4 subu t5, t4, s4 addu s6, t3, s5 subu s7, t3, s5 shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12) shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12) shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) sw t6, 28(t1) sw t5, 60(t1) sw s6, -4(t1) bgtz t9, 0b sw s7, 92(t1) // second loop three pass li t9, 3 1: lh s6, 34(t0) // inptr[DCTSIZE*2] lh t6, 34(a0) // quantptr[DCTSIZE*2] lh s7, 98(t0) // inptr[DCTSIZE*6] lh t7, 98(a0) // quantptr[DCTSIZE*6] mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) lh s4, 2(t0) // inptr[DCTSIZE*0] mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) lh s5, 2(a0) // quantptr[DCTSIZE*0] li s6, 15137 li s7, 6270 mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) mul v0, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) lh t5, 114(t0) // inptr[DCTSIZE*7] mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) lh s4, 114(a0) // quantptr[DCTSIZE*7] lh s5, 82(a0) // quantptr[DCTSIZE*5] lh t6, 82(t0) // inptr[DCTSIZE*5] sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) lh s6, 50(a0) // quantptr[DCTSIZE*3] lh t8, 18(t0) // inptr[DCTSIZE*1] subu v0, v0, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) lh t7, 50(t0) // inptr[DCTSIZE*3] lh s7, 18(a0) // quantptr[DCTSIZE*1] mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) 
mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) addu t3, t2, v0 // tmp10 = tmp0 + z2 subu t4, t2, v0 // tmp10 = tmp0 - z2 mult $ac0, zero, zero mult $ac1, zero, zero ins t5, t6, 16, 16 ins t7, t8, 16, 16 dpa.w.ph $ac0, t5, s0 dpa.w.ph $ac0, t7, s1 dpa.w.ph $ac1, t5, s2 dpa.w.ph $ac1, t7, s3 mflo t5, $ac0 mflo t6, $ac1 addiu t9, t9, -1 addiu t0, t0, 2 addiu a0, a0, 2 addiu t1, t1, 4 addu s5, t4, t5 subu s4, t4, t5 addu s6, t3, t6 subu s7, t3, t6 shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12) shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12) shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) sw s5, 32(t1) sw s4, 64(t1) sw s6, 0(t1) bgtz t9, 1b sw s7, 96(t1) move t1, v1 li s4, 15137 lw s6, 8(t1) // wsptr[2] li s5, 6270 lw s7, 24(t1) // wsptr[6] mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) lw t2, 0(t1) // wsptr[0] mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865) lh t5, 28(t1) // wsptr[7] lh t6, 20(t1) // wsptr[5] lh t7, 12(t1) // wsptr[3] lh t8, 4(t1) // wsptr[1] ins t5, t6, 16, 16 ins t7, t8, 16, 16 mult $ac0, zero, zero dpa.w.ph $ac0, t5, s0 dpa.w.ph $ac0, t7, s1 mult $ac1, zero, zero dpa.w.ph $ac1, t5, s2 dpa.w.ph $ac1, t7, s3 sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1) mflo s6, $ac0 // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) subu s4, s4, s5 addu t3, t2, s4 // tmp10 = tmp0 + z2 mflo s7, $ac1 subu t4, t2, s4 // tmp10 = tmp0 - z2 addu t7, t4, s6 subu t8, t4, s6 addu t5, t3, s7 subu t6, t3, s7 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) sll s4, t9, 2 lw v0, 0(a2) // output_buf[ctr] shll_s.w t5, t5, 24 shll_s.w t6, t6, 24 shll_s.w t7, t7, 24 shll_s.w t8, t8, 24 sra t5, t5, 24 sra t6, t6, 24 sra t7, t7, 24 sra t8, t8, 24 addu v0, v0, a3 // outptr = output_buf[ctr] + 
output_col addiu t5, t5, 128 addiu t6, t6, 128 addiu t7, t7, 128 addiu t8, t8, 128 sb t5, 0(v0) sb t7, 1(v0) sb t8, 2(v0) sb t6, 3(v0) // 2 li s4, 15137 lw s6, 40(t1) // wsptr[2] li s5, 6270 lw s7, 56(t1) // wsptr[6] mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) lw t2, 32(t1) // wsptr[0] mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865) lh t5, 60(t1) // wsptr[7] lh t6, 52(t1) // wsptr[5] lh t7, 44(t1) // wsptr[3] lh t8, 36(t1) // wsptr[1] ins t5, t6, 16, 16 ins t7, t8, 16, 16 mult $ac0, zero, zero dpa.w.ph $ac0, t5, s0 dpa.w.ph $ac0, t7, s1 mult $ac1, zero, zero dpa.w.ph $ac1, t5, s2 dpa.w.ph $ac1, t7, s3 sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1) mflo s6, $ac0 // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) subu s4, s4, s5 addu t3, t2, s4 // tmp10 = tmp0 + z2 mflo s7, $ac1 subu t4, t2, s4 // tmp10 = tmp0 - z2 addu t7, t4, s6 subu t8, t4, s6 addu t5, t3, s7 subu t6, t3, s7 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1) shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1) shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1) shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1) sll s4, t9, 2 lw v0, 4(a2) // output_buf[ctr] shll_s.w t5, t5, 24 shll_s.w t6, t6, 24 shll_s.w t7, t7, 24 shll_s.w t8, t8, 24 sra t5, t5, 24 sra t6, t6, 24 sra t7, t7, 24 sra t8, t8, 24 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col addiu t5, t5, 128 addiu t6, t6, 128 addiu t7, t7, 128 addiu t8, t8, 128 sb t5, 0(v0) sb t7, 1(v0) sb t8, 2(v0) sb t6, 3(v0) // 3 li s4, 15137 lw s6, 72(t1) // wsptr[2] li s5, 6270 lw s7, 88(t1) // wsptr[6] mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) lw t2, 64(t1) // wsptr[0] mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865) lh t5, 92(t1) // wsptr[7] lh t6, 84(t1) // wsptr[5] lh t7, 76(t1) // wsptr[3] lh t8, 68(t1) // wsptr[1] ins t5, t6, 16, 16 ins t7, t8, 16, 16 mult $ac0, 
zero, zero dpa.w.ph $ac0, t5, s0 dpa.w.ph $ac0, t7, s1 mult $ac1, zero, zero dpa.w.ph $ac1, t5, s2 dpa.w.ph $ac1, t7, s3 sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1) mflo s6, $ac0 // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) subu s4, s4, s5 addu t3, t2, s4 // tmp10 = tmp0 + z2 mflo s7, $ac1 subu t4, t2, s4 // tmp10 = tmp0 - z2 addu t7, t4, s6 subu t8, t4, s6 addu t5, t3, s7 subu t6, t3, s7 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) sll s4, t9, 2 lw v0, 8(a2) // output_buf[ctr] shll_s.w t5, t5, 24 shll_s.w t6, t6, 24 shll_s.w t7, t7, 24 shll_s.w t8, t8, 24 sra t5, t5, 24 sra t6, t6, 24 sra t7, t7, 24 sra t8, t8, 24 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col addiu t5, t5, 128 addiu t6, t6, 128 addiu t7, t7, 128 addiu t8, t8, 128 sb t5, 0(v0) sb t7, 1(v0) sb t8, 2(v0) sb t6, 3(v0) li s4, 15137 lw s6, 104(t1) // wsptr[2] li s5, 6270 lw s7, 120(t1) // wsptr[6] mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) lw t2, 96(t1) // wsptr[0] mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865) lh t5, 124(t1) // wsptr[7] lh t6, 116(t1) // wsptr[5] lh t7, 108(t1) // wsptr[3] lh t8, 100(t1) // wsptr[1] ins t5, t6, 16, 16 ins t7, t8, 16, 16 mult $ac0, zero, zero dpa.w.ph $ac0, t5, s0 dpa.w.ph $ac0, t7, s1 mult $ac1, zero, zero dpa.w.ph $ac1, t5, s2 dpa.w.ph $ac1, t7, s3 sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1) mflo s6, $ac0 // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) subu s4, s4, s5 addu t3, t2, s4 // tmp10 = tmp0 + z2; mflo s7, $ac1 subu t4, t2, s4 // tmp10 = tmp0 - z2; addu t7, t4, s6 subu t8, t4, s6 addu t5, t3, s7 subu t6, t3, s7 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) shra_r.w t8, 
t8, 19 // DESCALE(tmp12 - temp1, 19) sll s4, t9, 2 lw v0, 12(a2) // output_buf[ctr] shll_s.w t5, t5, 24 shll_s.w t6, t6, 24 shll_s.w t7, t7, 24 shll_s.w t8, t8, 24 sra t5, t5, 24 sra t6, t6, 24 sra t7, t7, 24 sra t8, t8, 24 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col addiu t5, t5, 128 addiu t6, t6, 128 addiu t7, t7, 128 addiu t8, t8, 128 sb t5, 0(v0) sb t7, 1(v0) sb t8, 2(v0) sb t6, 3(v0) RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 j ra nop END(jsimd_idct_4x4_dspr2) /*****************************************************************************/ LEAF_DSPR2(jsimd_idct_6x6_dspr2) /* * a0 = compptr->dct_table * a1 = coef_block * a2 = output_buf * a3 = output_col */ .set at SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 addiu sp, sp, -144 move v0, sp addiu v1, v0, 24 addiu t9, zero, 5793 addiu s0, zero, 10033 addiu s1, zero, 2998 1: lh s2, 0(a0) // q0 = quantptr[ 0] lh s3, 32(a0) // q1 = quantptr[16] lh s4, 64(a0) // q2 = quantptr[32] lh t2, 64(a1) // tmp2 = inptr[32] lh t1, 32(a1) // tmp1 = inptr[16] lh t0, 0(a1) // tmp0 = inptr[ 0] mul t2, t2, s4 // tmp2 = tmp2 * q2 mul t1, t1, s3 // tmp1 = tmp1 * q1 mul t0, t0, s2 // tmp0 = tmp0 * q0 lh t6, 16(a1) // z1 = inptr[ 8] lh t8, 80(a1) // z3 = inptr[40] lh t7, 48(a1) // z2 = inptr[24] lh s2, 16(a0) // q0 = quantptr[ 8] lh s4, 80(a0) // q2 = quantptr[40] lh s3, 48(a0) // q1 = quantptr[24] mul t2, t2, t9 // tmp2 = tmp2 * 5793 mul t1, t1, s0 // tmp1 = tmp1 * 10033 sll t0, t0, 13 // tmp0 = tmp0 << 13 mul t6, t6, s2 // z1 = z1 * q0 mul t8, t8, s4 // z3 = z3 * q2 mul t7, t7, s3 // z2 = z2 * q1 addu t3, t0, t2 // tmp10 = tmp0 + tmp2 sll t2, t2, 1 // tmp2 = tmp2 << 2 subu t4, t0, t2 // tmp11 = tmp0 - tmp2; subu t5, t3, t1 // tmp12 = tmp10 - tmp1 addu t3, t3, t1 // tmp10 = tmp10 + tmp1 addu t1, t6, t8 // tmp1 = z1 + z3 mul t1, t1, s1 // tmp1 = tmp1 * 2998 shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 subu t2, t6, t8 // tmp2 = z1 - z3 subu t2, t2, t7 // tmp2 = tmp2 - z2 sll t2, t2, 2 // tmp2 = 
tmp2 << 2 addu t0, t6, t7 // tmp0 = z1 + z2 sll t0, t0, 13 // tmp0 = tmp0 << 13 subu s2, t8, t7 // q0 = z3 - z2 sll s2, s2, 13 // q0 = q0 << 13 addu t0, t0, t1 // tmp0 = tmp0 + tmp1 addu t1, s2, t1 // tmp1 = q0 + tmp1 addu s2, t4, t2 // q0 = tmp11 + tmp2 subu s3, t4, t2 // q1 = tmp11 - tmp2 addu t6, t3, t0 // z1 = tmp10 + tmp0 subu t7, t3, t0 // z2 = tmp10 - tmp0 addu t4, t5, t1 // tmp11 = tmp12 + tmp1 subu t5, t5, t1 // tmp12 = tmp12 - tmp1 shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11 shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11 shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11 sw s2, 24(v0) sw s3, 96(v0) sw t6, 0(v0) sw t7, 120(v0) sw t4, 48(v0) sw t5, 72(v0) addiu v0, v0, 4 addiu a1, a1, 2 bne v0, v1, 1b addiu a0, a0, 2 /* Pass 2: process 6 rows from work array, store into output array. */ move v0, sp addiu v1, v0, 144 2: lw t0, 0(v0) lw t2, 16(v0) lw s5, 0(a2) addiu t0, t0, 16 sll t0, t0, 13 mul t3, t2, t9 lw t6, 4(v0) lw t8, 20(v0) lw t7, 12(v0) addu s5, s5, a3 addu s6, t6, t8 mul s6, s6, s1 addu t1, t0, t3 subu t4, t0, t3 subu t4, t4, t3 lw t3, 8(v0) mul t0, t3, s0 addu s7, t6, t7 sll s7, s7, 13 addu s7, s6, s7 subu t2, t8, t7 sll t2, t2, 13 addu t2, s6, t2 subu s6, t6, t7 subu s6, s6, t8 sll s6, s6, 13 addu t3, t1, t0 subu t5, t1, t0 addu t6, t3, s7 subu t3, t3, s7 addu t7, t4, s6 subu t4, t4, s6 addu t8, t5, t2 subu t5, t5, t2 shll_s.w t6, t6, 6 shll_s.w t3, t3, 6 shll_s.w t7, t7, 6 shll_s.w t4, t4, 6 shll_s.w t8, t8, 6 shll_s.w t5, t5, 6 sra t6, t6, 24 addiu t6, t6, 128 sra t3, t3, 24 addiu t3, t3, 128 sb t6, 0(s5) sra t7, t7, 24 addiu t7, t7, 128 sb t3, 5(s5) sra t4, t4, 24 addiu t4, t4, 128 sb t7, 1(s5) sra t8, t8, 24 addiu t8, t8, 128 sb t4, 4(s5) addiu v0, v0, 24 sra t5, t5, 24 addiu t5, t5, 128 sb t8, 2(s5) addiu a2, a2, 4 bne v0, v1, 2b sb t5, 3(s5) addiu sp, sp, 144 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 j ra nop END(jsimd_idct_6x6_dspr2) 
/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
/*
 * First pass of the 12x12 inverse DCT: coefficient columns -> workspace.
 *
 * a0 = compptr->dct_table
 * a1 = coef_block
 * a2 = workspace
 *
 * Loops over 8 coefficient columns (a3 counts down from 8).  Each iteration
 * dequantizes rows 0..7 of one column (16 bytes apart) and stores twelve
 * 32-bit results into the workspace, 32 bytes apart, then advances the
 * coefficient/quantizer pointers by 2 bytes and the workspace pointer by 4.
 */

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    li            a3, 8  // column counter

1:
    // odd part
    lh            t0, 48(a1)  // inptr[DCTSIZE*3]
    lh            t1, 48(a0)  // quantptr[DCTSIZE*3]
    lh            t2, 16(a1)  // inptr[DCTSIZE*1]
    lh            t3, 16(a0)  // quantptr[DCTSIZE*1]
    lh            t4, 80(a1)  // inptr[DCTSIZE*5]
    lh            t5, 80(a0)  // quantptr[DCTSIZE*5]
    lh            t6, 112(a1)  // inptr[DCTSIZE*7]
    lh            t7, 112(a0)  // quantptr[DCTSIZE*7]
    mul           t0, t0, t1  // z2
    mul           t1, t2, t3  // z1
    mul           t2, t4, t5  // z3
    mul           t3, t6, t7  // z4
    li            t4, 10703  // FIX(1.306562965)
    li            t5, 4433  // FIX_0_541196100
    li            t6, 7053  // FIX(0.860918669)
    mul           t4, t0, t4  // tmp11
    mul           t5, t0, t5  // -tmp14
    addu          t7, t1, t2  // tmp10
    addu          t8, t7, t3  // tmp10 + z4
    mul           t6, t6, t8  // tmp15
    li            t8, 2139  // FIX(0.261052384)
    mul           t8, t7, t8  // MULTIPLY(tmp10, FIX(0.261052384))
    li            t7, 2295  // FIX(0.280143716)
    mul           t7, t1, t7  // MULTIPLY(z1, FIX(0.280143716))
    addu          t9, t2, t3  // z3 + z4
    li            s0, 8565  // FIX(1.045510580)
    mul           t9, t9, s0  // -tmp13
    li            s0, 12112  // FIX(1.478575242)
    mul           s0, t2, s0  // MULTIPLY(z3, FIX(1.478575242))
    li            s1, 12998  // FIX(1.586706681)
    mul           s1, t3, s1  // MULTIPLY(z4, FIX(1.586706681))
    li            s2, 5540  // FIX(0.676326758)
    mul           s2, t1, s2  // MULTIPLY(z1, FIX(0.676326758))
    li            s3, 16244  // FIX(1.982889723)
    mul           s3, t3, s3  // MULTIPLY(z4, FIX(1.982889723))
    subu          t1, t1, t3  // z1-=z4
    subu          t0, t0, t2  // z2-=z3
    addu          t2, t0, t1  // z1+z2
    li            t3, 4433  // FIX_0_541196100
    mul           t2, t2, t3  // z3
    li            t3, 6270  // FIX_0_765366865
    mul           t1, t1, t3  // MULTIPLY(z1, FIX_0_765366865)
    li            t3, 15137  // FIX_1_847759065
    mul           t0, t0, t3  // MULTIPLY(z2, FIX_1_847759065)
    addu          t8, t6, t8  // tmp12
    addu          t3, t8, t4  // tmp12 + tmp11
    addu          t3, t3, t7  // tmp10
    subu          t8, t8, t9  // tmp12 + tmp13
    addu          s0, t5, s0
    subu          t8, t8, s0  // tmp12
    subu          t9, t6, t9
    subu          s1, s1, t4
    addu          t9, t9, s1  // tmp13
    subu          t6, t6, t5
    subu          t6, t6, s2
    subu          t6, t6, s3  // tmp15
    // even part start (loads issued before the odd part is finished)
    lh            t4, 64(a1)  // inptr[DCTSIZE*4]
    lh            t5, 64(a0)  // quantptr[DCTSIZE*4]
    lh            t7, 32(a1)  // inptr[DCTSIZE*2]
    lh            s0, 32(a0)  // quantptr[DCTSIZE*2]
    lh            s1, 0(a1)  // inptr[DCTSIZE*0]
    lh            s2, 0(a0)  // quantptr[DCTSIZE*0]
    lh            s3, 96(a1)  // inptr[DCTSIZE*6]
    lh            v0, 96(a0)  // quantptr[DCTSIZE*6]
    mul           t4, t4, t5  // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
    mul           t5, t7, s0  // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
    mul           t7, s1, s2  // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
    mul           s0, s3, v0  // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
    // odd part end
    addu          t1, t2, t1  // tmp11
    subu          t0, t2, t0  // tmp14
    // update counter and pointers
    addiu         a3, a3, -1
    addiu         a0, a0, 2
    addiu         a1, a1, 2
    // even part rest
    li            s1, 10033
    li            s2, 11190
    mul           t4, t4, s1  // z4
    mul           s1, t5, s2  // z4
    sll           t5, t5, 13  // z1
    sll           t7, t7, 13
    addiu         t7, t7, 1024  // z3 (with rounding term for the >> 11 below)
    sll           s0, s0, 13  // z2
    addu          s2, t7, t4  // tmp10
    subu          t4, t7, t4  // tmp11
    subu          s3, t5, s0  // tmp12
    addu          t2, t7, s3  // tmp21
    subu          s3, t7, s3  // tmp24
    addu          t7, s1, s0  // tmp12
    addu          v0, s2, t7  // tmp20
    subu          s2, s2, t7  // tmp25
    subu          s1, s1, t5  // z4 - z1
    subu          s1, s1, s0  // tmp12
    addu          s0, t4, s1  // tmp22
    subu          t4, t4, s1  // tmp23
    // final output stage: even +/- odd, then >> 11
    addu          t5, v0, t3
    subu          v0, v0, t3
    addu          t3, t2, t1
    subu          t2, t2, t1
    addu          t1, s0, t8
    subu          s0, s0, t8
    addu          t8, t4, t9
    subu          t4, t4, t9
    addu          t9, s3, t0
    subu          s3, s3, t0
    addu          t0, s2, t6
    subu          s2, s2, t6
    sra           t5, t5, 11
    sra           t3, t3, 11
    sra           t1, t1, 11
    sra           t8, t8, 11
    sra           t9, t9, 11
    sra           t0, t0, 11
    sra           s2, s2, 11
    sra           s3, s3, 11
    sra           t4, t4, 11
    sra           s0, s0, 11
    sra           t2, t2, 11
    sra           v0, v0, 11
    sw            t5, 0(a2)  // workspace row 0
    sw            t3, 32(a2)  // workspace row 1
    sw            t1, 64(a2)  // workspace row 2
    sw            t8, 96(a2)  // workspace row 3
    sw            t9, 128(a2)  // workspace row 4
    sw            t0, 160(a2)  // workspace row 5
    sw            s2, 192(a2)  // workspace row 6
    sw            s3, 224(a2)  // workspace row 7
    sw            t4, 256(a2)  // workspace row 8
    sw            s0, 288(a2)  // workspace row 9
    sw            t2, 320(a2)  // workspace row 10
    sw            v0, 352(a2)  // workspace row 11
    bgtz          a3, 1b
    addiu         a2, a2, 4  // (delay slot) next workspace column

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j             ra
    nop

END(jsimd_idct_12x12_pass1_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
/*
 * Second pass of the 12x12 inverse DCT: workspace rows -> output samples.
 *
 * a0 = workspace
 * a1 = output
 *
 * Loops over 12 workspace rows (a3 counts down from 12; a0 advances by
 * 32 bytes per row).  Each iteration reads eight 32-bit workspace words,
 * computes twelve samples and stores them as bytes through the row pointer
 * loaded from 0(a1); a1 advances by 4 bytes per row, so "output" is read as
 * an array of row pointers.
 */

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    li            a3, 12  // row counter

1:
    // Odd part
    lw            t0, 12(a0)  // z2 = wsptr[3]
    lw            t1, 4(a0)  // z1 = wsptr[1]
    lw            t2, 20(a0)  // z3 = wsptr[5]
    lw            t3, 28(a0)  // z4 = wsptr[7]
    li            t4, 10703  // FIX(1.306562965)
    li            t5, 4433  // FIX_0_541196100
    mul           t4, t0, t4  // tmp11
    mul           t5, t0, t5  // -tmp14
    addu          t6, t1, t2  // tmp10
    li            t7, 2139  // FIX(0.261052384)
    mul           t7, t6, t7  // MULTIPLY(tmp10, FIX(0.261052384))
    addu          t6, t6, t3  // tmp10 + z4
    li            t8, 7053  // FIX(0.860918669)
    mul           t6, t6, t8  // tmp15
    li            t8, 2295  // FIX(0.280143716)
    mul           t8, t1, t8  // MULTIPLY(z1, FIX(0.280143716))
    addu          t9, t2, t3  // z3 + z4
    li            s0, 8565  // FIX(1.045510580)
    mul           t9, t9, s0  // -tmp13
    li            s0, 12112  // FIX(1.478575242)
    mul           s0, t2, s0  // MULTIPLY(z3, FIX(1.478575242))
    li            s1, 12998  // FIX(1.586706681)
    mul           s1, t3, s1  // MULTIPLY(z4, FIX(1.586706681))
    li            s2, 5540  // FIX(0.676326758)
    mul           s2, t1, s2  // MULTIPLY(z1, FIX(0.676326758))
    li            s3, 16244  // FIX(1.982889723)
    mul           s3, t3, s3  // MULTIPLY(z4, FIX(1.982889723))
    subu          t1, t1, t3  // z1 -= z4
    subu          t0, t0, t2  // z2 -= z3
    addu          t2, t1, t0  // z1 + z2
    li            t3, 4433  // FIX_0_541196100
    mul           t2, t2, t3  // z3
    li            t3, 6270  // FIX_0_765366865
    mul           t1, t1, t3  // MULTIPLY(z1, FIX_0_765366865)
    li            t3, 15137  // FIX_1_847759065
    mul           t0, t0, t3  // MULTIPLY(z2, FIX_1_847759065)
    addu          t3, t6, t7  // tmp12
    addu          t7, t3, t4
    addu          t7, t7, t8  // tmp10
    subu          t3, t3, t9
    subu          t3, t3, t5
    subu          t3, t3, s0  // tmp12
    subu          t9, t6, t9
    subu          t9, t9, t4
    addu          t9, t9, s1  // tmp13
    subu          t6, t6, t5
    subu          t6, t6, s2
    subu          t6, t6, s3  // tmp15
    addu          t1, t2, t1  // tmp11
    subu          t0, t2, t0  // tmp14
    // even part
    lw            t2, 16(a0)  // z4
    lw            t4, 8(a0)  // z1
    lw            t5, 0(a0)  // z3
    lw            t8, 24(a0)  // z2
    li            s0, 10033  // FIX(1.224744871)
    li            s1, 11190  // FIX(1.366025404)
    mul           t2, t2, s0  // z4
    mul           s0, t4, s1  // z4
    addiu         t5, t5, 0x10  // rounding term added before the shift
    sll           t5, t5, 13  // z3
    sll           t4, t4, 13  // z1
    sll           t8, t8, 13  // z2
    subu          s1, t4, t8  // tmp12
    addu          s2, t5, t2  // tmp10
    subu          t2, t5, t2  // tmp11
    addu          s3, t5, s1  // tmp21
    subu          s1, t5, s1  // tmp24
    addu          t5, s0, t8  // tmp12
    addu          v0, s2, t5  // tmp20
    subu          t5, s2, t5  // tmp25
    subu          t4, s0, t4
    subu          t4, t4, t8  // tmp12
    addu          t8, t2, t4  // tmp22
    subu          t2, t2, t4  // tmp23
    // increment counter and pointers
    addiu         a3, a3, -1
    addiu         a0, a0, 32
    // Final stage: even +/- odd
    addu          t4, v0, t7
    subu          v0, v0, t7
    addu          t7, s3, t1
    subu          s3, s3, t1
    addu          t1, t8, t3
    subu          t8, t8, t3
    addu          t3, t2, t9
    subu          t2, t2, t9
    addu          t9, s1, t0
    subu          s1, s1, t0
    addu          t0, t5, t6
    subu          t5, t5, t6
    // Plain shift left by 4, then a saturating shift left by 2, so that the
    // sample ends up in the top byte of each register.
    sll           t4, t4, 4
    sll           t7, t7, 4
    sll           t1, t1, 4
    sll           t3, t3, 4
    sll           t9, t9, 4
    sll           t0, t0, 4
    sll           t5, t5, 4
    sll           s1, s1, 4
    sll           t2, t2, 4
    sll           t8, t8, 4
    sll           s3, s3, 4
    sll           v0, v0, 4
    shll_s.w      t4, t4, 2
    shll_s.w      t7, t7, 2
    shll_s.w      t1, t1, 2
    shll_s.w      t3, t3, 2
    shll_s.w      t9, t9, 2
    shll_s.w      t0, t0, 2
    shll_s.w      t5, t5, 2
    shll_s.w      s1, s1, 2
    shll_s.w      t2, t2, 2
    shll_s.w      t8, t8, 2
    shll_s.w      s3, s3, 2
    shll_s.w      v0, v0, 2
    // Logical shift: only the low byte is stored below, so the missing sign
    // extension does not affect the stored value.
    srl           t4, t4, 24
    srl           t7, t7, 24
    srl           t1, t1, 24
    srl           t3, t3, 24
    srl           t9, t9, 24
    srl           t0, t0, 24
    srl           t5, t5, 24
    srl           s1, s1, 24
    srl           t2, t2, 24
    srl           t8, t8, 24
    srl           s3, s3, 24
    srl           v0, v0, 24
    lw            t6, 0(a1)  // t6 = output row pointer
    addiu         t4, t4, 0x80  // re-center (+128)
    addiu         t7, t7, 0x80
    addiu         t1, t1, 0x80
    addiu         t3, t3, 0x80
    addiu         t9, t9, 0x80
    addiu         t0, t0, 0x80
    addiu         t5, t5, 0x80
    addiu         s1, s1, 0x80
    addiu         t2, t2, 0x80
    addiu         t8, t8, 0x80
    addiu         s3, s3, 0x80
    addiu         v0, v0, 0x80
    sb            t4, 0(t6)
    sb            t7, 1(t6)
    sb            t1, 2(t6)
    sb            t3, 3(t6)
    sb            t9, 4(t6)
    sb            t0, 5(t6)
    sb            t5, 6(t6)
    sb            s1, 7(t6)
    sb            t2, 8(t6)
    sb            t8, 9(t6)
    sb            s3, 10(t6)
    sb            v0, 11(t6)
    bgtz          a3, 1b
    addiu         a1, a1, 4  // (delay slot) next output row pointer

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    // NOTE(review): the other functions here return with "j ra"; this one
    // spells it "jr ra".
    jr            ra
    nop

END(jsimd_idct_12x12_pass2_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_convsamp_dspr2)
/*
 * Load an 8x8 block of samples into the workspace as 16-bit values,
 * subtracting 128 from each sample.
 *
 * a0 = sample_data
 * a1 = start_col
 * a2 = workspace
 *
 * For each of the eight rows the row pointer sample_data[row] is loaded,
 * start_col is added, and eight bytes are fetched with two unaligned word
 * loads.  preceu.ph.qbr / preceu.ph.qbl zero-extend the two right / two left
 * bytes of a word into halfwords, addu.ph adds 0xff80 (-128) to every
 * halfword, and 16 bytes per row are written with unaligned word stores
 * (128 bytes in total).
 *
 * The eight rows are fully unrolled and software-pipelined: the loads for
 * row n+1 are issued while row n is still being converted and stored.
 *
 * NOTE(review): storing the "right" pair before the "left" pair keeps the
 * samples in memory order only on a little-endian target - confirm.
 */

    lw            t0, 0(a0)  // t0 = sample_data[0]
    li            t7, 0xff80ff80  // -128 in both halfwords
    addu          t0, t0, a1  // row 0 + start_col
    ulw           t1, 0(t0)  // row 0, bytes 0..3
    ulw           t2, 4(t0)  // row 0, bytes 4..7
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    lw            t0, 4(a0)  // t0 = sample_data[1]
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1  // row 1 + start_col
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)  // row 1, bytes 0..3
    ulw           t2, 4(t0)  // row 1, bytes 4..7
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 0(a2)  // row 0 results
    usw           t4, 4(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 8(a2)
    usw           t6, 12(a2)

    lw            t0, 8(a0)  // t0 = sample_data[2]
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1  // row 2 + start_col
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)  // row 2, bytes 0..3
    ulw           t2, 4(t0)  // row 2, bytes 4..7
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 16(a2)  // row 1 results
    usw           t4, 20(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 24(a2)
    usw           t6, 28(a2)

    lw            t0, 12(a0)  // t0 = sample_data[3]
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1  // row 3 + start_col
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)  // row 3, bytes 0..3
    ulw           t2, 4(t0)  // row 3, bytes 4..7
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 32(a2)  // row 2 results
    usw           t4, 36(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 40(a2)
    usw           t6, 44(a2)

    lw            t0, 16(a0)  // t0 = sample_data[4]
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1  // row 4 + start_col
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)  // row 4, bytes 0..3
    ulw           t2, 4(t0)  // row 4, bytes 4..7
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 48(a2)  // row 3 results
    usw           t4, 52(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 56(a2)
    usw           t6, 60(a2)

    lw            t0, 20(a0)  // t0 = sample_data[5]
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1  // row 5 + start_col
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)  // row 5, bytes 0..3
    ulw           t2, 4(t0)  // row 5, bytes 4..7
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 64(a2)  // row 4 results
    usw           t4, 68(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 72(a2)
    usw           t6, 76(a2)

    lw            t0, 24(a0)  // t0 = sample_data[6]
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1  // row 6 + start_col
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)  // row 6, bytes 0..3
    ulw           t2, 4(t0)  // row 6, bytes 4..7
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 80(a2)  // row 5 results
    usw           t4, 84(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 88(a2)
    usw           t6, 92(a2)

    lw            t0, 28(a0)  // t0 = sample_data[7]
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1  // row 7 + start_col
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)  // row 7, bytes 0..3
    ulw           t2, 4(t0)  // row 7, bytes 4..7
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 96(a2)  // row 6 results
    usw           t4, 100(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 104(a2)
    usw           t6, 108(a2)

    // Pipeline drain: convert and store row 7.
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 112(a2)
    usw           t4, 116(a2)
    usw           t5, 120(a2)
    usw           t6, 124(a2)

    j             ra
    nop

END(jsimd_convsamp_dspr2)


#ifndef __mips_soft_float

/*****************************************************************************/
LEAF_DSPR2(jsimd_convsamp_float_dspr2)
/*
 * Load an 8x8 block of samples into the workspace as single-precision
 * floats, subtracting 128 from each sample.
 *
 * a0 = sample_data
 * a1 = start_col
 * a2 = workspace
 *
 * For each of the eight rows the row pointer sample_data[row] is loaded,
 * start_col is added, and eight bytes are read with lbu.  Each byte has 128
 * subtracted, is moved to an FPU register (mtc1), converted from integer to
 * single precision (cvt.s.w) and stored with swc1: 32 bytes per row,
 * 256 bytes in total.  The code is fully unrolled; the pointer for the next
 * row is fetched while the current row is being stored.
 *
 * Only compiled when __mips_soft_float is not defined.
 */
    .set at

    // row 0
    lw            t0, 0(a0)  // t0 = sample_data[0]
    addu          t0, t0, a1  // + start_col
    lbu           t1, 0(t0)
    lbu           t2, 1(t0)
    lbu           t3, 2(t0)
    lbu           t4, 3(t0)
    lbu           t5, 4(t0)
    lbu           t6, 5(t0)
    lbu           t7, 6(t0)
    lbu           t8, 7(t0)
    addiu         t1, t1, -128
    addiu         t2, t2, -128
    addiu         t3, t3, -128
    addiu         t4, t4, -128
    addiu         t5, t5, -128
    addiu         t6, t6, -128
    addiu         t7, t7, -128
    addiu         t8, t8, -128
    mtc1          t1, f2
    mtc1          t2, f4
    mtc1          t3, f6
    mtc1          t4, f8
    mtc1          t5, f10
    mtc1          t6, f12
    mtc1          t7, f14
    mtc1          t8, f16
    cvt.s.w       f2, f2
    cvt.s.w       f4, f4
    cvt.s.w       f6, f6
    cvt.s.w       f8, f8
    cvt.s.w       f10, f10
    cvt.s.w       f12, f12
    cvt.s.w       f14, f14
    cvt.s.w       f16, f16
    lw            t0, 4(a0)  // t0 = sample_data[1]
    swc1          f2, 0(a2)
    swc1          f4, 4(a2)
    swc1          f6, 8(a2)
    addu          t0, t0, a1  // + start_col
    swc1          f8, 12(a2)
    swc1          f10, 16(a2)
    swc1          f12, 20(a2)
    swc1          f14, 24(a2)
    swc1          f16, 28(a2)
    // row 1
    lbu           t1, 0(t0)
    lbu           t2, 1(t0)
    lbu           t3, 2(t0)
    lbu           t4, 3(t0)
    lbu           t5, 4(t0)
    lbu           t6, 5(t0)
    lbu           t7, 6(t0)
    lbu           t8, 7(t0)
    addiu         t1, t1, -128
    addiu         t2, t2, -128
    addiu         t3, t3, -128
    addiu         t4, t4, -128
    addiu         t5, t5, -128
    addiu         t6, t6, -128
    addiu         t7, t7, -128
    addiu         t8, t8, -128
    mtc1          t1, f2
    mtc1          t2, f4
    mtc1          t3, f6
    mtc1          t4, f8
    mtc1          t5, f10
    mtc1          t6, f12
    mtc1          t7, f14
    mtc1          t8, f16
    cvt.s.w       f2, f2
    cvt.s.w       f4, f4
    cvt.s.w       f6, f6
    cvt.s.w       f8, f8
    cvt.s.w       f10, f10
    cvt.s.w       f12, f12
    cvt.s.w       f14, f14
    cvt.s.w       f16, f16
    lw            t0, 8(a0)  // t0 = sample_data[2]
    swc1          f2, 32(a2)
    swc1          f4, 36(a2)
    swc1          f6, 40(a2)
    addu          t0, t0, a1  // + start_col
    swc1          f8, 44(a2)
    swc1          f10, 48(a2)
    swc1          f12, 52(a2)
    swc1          f14, 56(a2)
    swc1          f16, 60(a2)
    // row 2
    lbu           t1, 0(t0)
    lbu           t2, 1(t0)
    lbu           t3, 2(t0)
    lbu           t4, 3(t0)
    lbu           t5, 4(t0)
    lbu           t6, 5(t0)
    lbu           t7, 6(t0)
    lbu           t8, 7(t0)
    addiu         t1, t1, -128
    addiu         t2, t2, -128
    addiu         t3, t3, -128
    addiu         t4, t4, -128
    addiu         t5, t5, -128
    addiu         t6, t6, -128
    addiu         t7, t7, -128
    addiu         t8, t8, -128
    mtc1          t1, f2
    mtc1          t2, f4
    mtc1          t3, f6
    mtc1          t4, f8
    mtc1          t5, f10
    mtc1          t6, f12
    mtc1          t7, f14
    mtc1          t8, f16
    cvt.s.w       f2, f2
    cvt.s.w       f4, f4
    cvt.s.w       f6, f6
    cvt.s.w       f8, f8
    cvt.s.w       f10, f10
    cvt.s.w       f12, f12
    cvt.s.w       f14, f14
    cvt.s.w       f16, f16
    lw            t0, 12(a0)  // t0 = sample_data[3]
    swc1          f2, 64(a2)
    swc1          f4, 68(a2)
    swc1          f6, 72(a2)
    addu          t0, t0, a1  // + start_col
    swc1          f8, 76(a2)
    swc1          f10, 80(a2)
    swc1          f12, 84(a2)
    swc1          f14, 88(a2)
    swc1          f16, 92(a2)
    // row 3
    lbu           t1, 0(t0)
    lbu           t2, 1(t0)
    lbu           t3, 2(t0)
    lbu           t4, 3(t0)
    lbu           t5, 4(t0)
    lbu           t6, 5(t0)
    lbu           t7, 6(t0)
    lbu           t8, 7(t0)
    addiu         t1, t1, -128
    addiu         t2, t2, -128
    addiu         t3, t3, -128
    addiu         t4, t4, -128
    addiu         t5, t5, -128
    addiu         t6, t6, -128
    addiu         t7, t7, -128
    addiu         t8, t8, -128
    mtc1          t1, f2
    mtc1          t2, f4
    mtc1          t3, f6
    mtc1          t4, f8
    mtc1          t5, f10
    mtc1          t6, f12
    mtc1          t7, f14
    mtc1          t8, f16
    cvt.s.w       f2, f2
    cvt.s.w       f4, f4
    cvt.s.w       f6, f6
    cvt.s.w       f8, f8
    cvt.s.w       f10, f10
    cvt.s.w       f12, f12
    cvt.s.w       f14, f14
    cvt.s.w       f16, f16
    lw            t0, 16(a0)  // t0 = sample_data[4]
    swc1          f2, 96(a2)
    swc1          f4, 100(a2)
    swc1          f6, 104(a2)
    addu          t0, t0, a1  // + start_col
    swc1          f8, 108(a2)
    swc1          f10, 112(a2)
    swc1          f12, 116(a2)
    swc1          f14, 120(a2)
    swc1          f16, 124(a2)
    // row 4
    lbu           t1, 0(t0)
    lbu           t2, 1(t0)
    lbu           t3, 2(t0)
    lbu           t4, 3(t0)
    lbu           t5, 4(t0)
    lbu           t6, 5(t0)
    lbu           t7, 6(t0)
    lbu           t8, 7(t0)
    addiu         t1, t1, -128
    addiu         t2, t2, -128
    addiu         t3, t3, -128
    addiu         t4, t4, -128
    addiu         t5, t5, -128
    addiu         t6, t6, -128
    addiu         t7, t7, -128
    addiu         t8, t8, -128
    mtc1          t1, f2
    mtc1          t2, f4
    mtc1          t3, f6
    mtc1          t4, f8
    mtc1          t5, f10
    mtc1          t6, f12
    mtc1          t7, f14
    mtc1          t8, f16
    cvt.s.w       f2, f2
    cvt.s.w       f4, f4
    cvt.s.w       f6, f6
    cvt.s.w       f8, f8
    cvt.s.w       f10, f10
    cvt.s.w       f12, f12
    cvt.s.w       f14, f14
    cvt.s.w       f16, f16
    lw            t0, 20(a0)  // t0 = sample_data[5]
    swc1          f2, 128(a2)
    swc1          f4, 132(a2)
    swc1          f6, 136(a2)
    addu          t0, t0, a1  // + start_col
    swc1          f8, 140(a2)
    swc1          f10, 144(a2)
    swc1          f12, 148(a2)
    swc1          f14, 152(a2)
    swc1          f16, 156(a2)
    // row 5
    lbu           t1, 0(t0)
    lbu           t2, 1(t0)
    lbu           t3, 2(t0)
    lbu           t4, 3(t0)
    lbu           t5, 4(t0)
    lbu           t6, 5(t0)
    lbu           t7, 6(t0)
    lbu           t8, 7(t0)
    addiu         t1, t1, -128
    addiu         t2, t2, -128
    addiu         t3, t3, -128
    addiu         t4, t4, -128
    addiu         t5, t5, -128
    addiu         t6, t6, -128
    addiu         t7, t7, -128
    addiu         t8, t8, -128
    mtc1          t1, f2
    mtc1          t2, f4
    mtc1          t3, f6
    mtc1          t4, f8
    mtc1          t5, f10
    mtc1          t6, f12
    mtc1          t7, f14
    mtc1          t8, f16
    cvt.s.w       f2, f2
    cvt.s.w       f4, f4
    cvt.s.w       f6, f6
    cvt.s.w       f8, f8
    cvt.s.w       f10, f10
    cvt.s.w       f12, f12
    cvt.s.w       f14, f14
    cvt.s.w       f16, f16
    lw            t0, 24(a0)  // t0 = sample_data[6]
    swc1          f2, 160(a2)
    swc1          f4, 164(a2)
    swc1          f6, 168(a2)
    addu          t0, t0, a1  // + start_col
    swc1          f8, 172(a2)
    swc1          f10, 176(a2)
    swc1          f12, 180(a2)
    swc1          f14, 184(a2)
    swc1          f16, 188(a2)
    // row 6
    lbu           t1, 0(t0)
    lbu           t2, 1(t0)
    lbu           t3, 2(t0)
    lbu           t4, 3(t0)
    lbu           t5, 4(t0)
    lbu           t6, 5(t0)
    lbu           t7, 6(t0)
    lbu           t8, 7(t0)
    addiu         t1, t1, -128
    addiu         t2, t2, -128
    addiu         t3, t3, -128
    addiu         t4, t4, -128
    addiu         t5, t5, -128
    addiu         t6, t6, -128
    addiu         t7, t7, -128
    addiu         t8, t8, -128
    mtc1          t1, f2
    mtc1          t2, f4
    mtc1          t3, f6
    mtc1          t4, f8
    mtc1          t5, f10
    mtc1          t6, f12
    mtc1          t7, f14
    mtc1          t8, f16
    cvt.s.w       f2, f2
    cvt.s.w       f4, f4
    cvt.s.w       f6, f6
    cvt.s.w       f8, f8
    cvt.s.w       f10, f10
    cvt.s.w       f12, f12
    cvt.s.w       f14, f14
    cvt.s.w       f16, f16
    lw            t0, 28(a0)  // t0 = sample_data[7]
    swc1          f2, 192(a2)
    swc1          f4, 196(a2)
    swc1          f6, 200(a2)
    addu          t0, t0, a1  // + start_col
    swc1          f8, 204(a2)
    swc1          f10, 208(a2)
    swc1          f12, 212(a2)
    swc1          f14, 216(a2)
    swc1          f16, 220(a2)
    // row 7
    lbu           t1, 0(t0)
    lbu           t2, 1(t0)
    lbu           t3, 2(t0)
    lbu           t4, 3(t0)
    lbu           t5, 4(t0)
    lbu           t6, 5(t0)
    lbu           t7, 6(t0)
    lbu           t8, 7(t0)
    addiu         t1, t1, -128
    addiu         t2, t2, -128
    addiu         t3, t3, -128
    addiu         t4, t4, -128
    addiu         t5, t5, -128
    addiu         t6, t6, -128
    addiu         t7, t7, -128
    addiu         t8, t8, -128
    mtc1          t1, f2
    mtc1          t2, f4
    mtc1          t3, f6
    mtc1          t4, f8
    mtc1          t5, f10
    mtc1          t6, f12
    mtc1          t7, f14
    mtc1          t8, f16
    cvt.s.w       f2, f2
    cvt.s.w       f4, f4
    cvt.s.w       f6, f6
    cvt.s.w       f8, f8
    cvt.s.w       f10, f10
    cvt.s.w       f12, f12
    cvt.s.w       f14, f14
    cvt.s.w       f16, f16
    swc1          f2, 224(a2)
    swc1          f4, 228(a2)
    swc1          f6, 232(a2)
    swc1          f8, 236(a2)
    swc1          f10, 240(a2)
    swc1          f12, 244(a2)
    swc1          f14, 248(a2)
    swc1          f16, 252(a2)

    j             ra
    nop

END(jsimd_convsamp_float_dspr2)

#endif

/*****************************************************************************/