/*
 * MIPS DSPr2 optimizations for libjpeg-turbo
 *
 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
 *                          All Rights Reserved.
 * Authors:  Teodora Novkovic <teodora.novkovic@imgtec.com>
 *           Darko Laus       <darko.laus@imgtec.com>
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#include "jsimd_dspr2_asm.h"


/*****************************************************************************/
LEAF_DSPR2(jsimd_c_null_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 * 20(sp) = cinfo->num_components
 *
 * Null conversion for compression
 *
 * De-interleaves the input: for every row and every component ci, the
 * input bytes at ci, ci + num_components, ci + 2 * num_components, ... are
 * copied to consecutive bytes of output_buf[ci][output_row].
 *
 * Two row loops exist.  Labels 0-3 serve widths that are not a multiple of
 * 4: the (image_width & 3) leftover samples are copied one at a time first,
 * then the rest of the row four samples per iteration.  Labels 4-6 serve
 * widths that are a multiple of 4 and only use the four-sample loop.
 *
 * The stack arguments are read 8 bytes above the offsets listed here;
 * presumably SAVE_REGS_ON_STACK lowers sp by its first argument (8).
 *
 * NOTE(review): image_width and num_components are assumed to be nonzero;
 * the copy loops test for termination only after their first pass.
 */
    SAVE_REGS_ON_STACK 8, s0, s1

    lw          t9, 24(sp)      // t9 = num_rows
    lw          s0, 28(sp)      // s0 = cinfo->num_components
    andi        t0, a0, 3       // t0 = cinfo->image_width & 3
    beqz        t0, 4f          // no residual
     nop

    /* ---- image_width is not a multiple of 4 ---- */
0:
    addiu       t9, t9, -1      // --num_rows
    bltz        t9, 7f          // all rows done
     li         t1, 0           // t1 = ci = 0 (delay slot)
1:
    sll         t3, t1, 2
    lwx         t5, t3(a2)      // t5 = outptr = output_buf[ci]
    lw          t2, 0(a1)       // t2 = inptr = *input_buf
    sll         t4, a3, 2
    lwx         t5, t4(t5)      // t5 = outptr = output_buf[ci][output_row]
    addu        t2, t2, t1      // t2 = inptr + ci
    addu        s1, t5, a0      // s1 = end of the output row
    addu        t6, t5, t0      // t6 = end of the residual samples
2:
    /* residual samples, one per iteration */
    lbu         t3, 0(t2)
    addiu       t5, t5, 1
    sb          t3, -1(t5)
    bne         t6, t5, 2b
     addu       t2, t2, s0      // inptr += num_components (delay slot)
    /*
     * When image_width < 4 the residual loop has already copied the whole
     * row.  The four-sample loop below is tested at the bottom only, so
     * entering it with outptr == end would overshoot the end address and
     * never terminate; skip it in that case.
     */
    beq         s1, t5, 33f
     nop
3:
    /* four samples per iteration */
    lbu         t3, 0(t2)
    addu        t4, t2, s0
    addu        t7, t4, s0
    addu        t8, t7, s0
    addu        t2, t8, s0      // inptr += 4 * num_components
    lbu         t4, 0(t4)
    lbu         t7, 0(t7)
    lbu         t8, 0(t8)
    addiu       t5, t5, 4
    sb          t3, -4(t5)
    sb          t4, -3(t5)
    sb          t7, -2(t5)
    bne         s1, t5, 3b
     sb         t8, -1(t5)      // fourth store rides in the delay slot
33:
    addiu       t1, t1, 1       // ++ci
    bne         t1, s0, 1b
     nop
    addiu       a1, a1, 4       // ++input_buf
    bgez        t9, 0b
     addiu      a3, a3, 1       // ++output_row (delay slot)
    b           7f
     nop

    /* ---- image_width is a multiple of 4 ---- */
4:
    addiu       t9, t9, -1      // --num_rows
    bltz        t9, 7f          // all rows done
     li         t1, 0           // t1 = ci = 0 (delay slot)
5:
    sll         t3, t1, 2
    lwx         t5, t3(a2)      // t5 = outptr = output_buf[ci]
    lw          t2, 0(a1)       // t2 = inptr = *input_buf
    sll         t4, a3, 2
    lwx         t5, t4(t5)      // t5 = outptr = output_buf[ci][output_row]
    addu        t2, t2, t1      // t2 = inptr + ci
    addu        s1, t5, a0      // s1 = end of the output row
6:
    /* four samples per iteration */
    lbu         t3, 0(t2)
    addu        t4, t2, s0
    addu        t7, t4, s0
    addu        t8, t7, s0
    addu        t2, t8, s0      // inptr += 4 * num_components
    lbu         t4, 0(t4)
    lbu         t7, 0(t7)
    lbu         t8, 0(t8)
    addiu       t5, t5, 4
    sb          t3, -4(t5)
    sb          t4, -3(t5)
    sb          t7, -2(t5)
    bne         s1, t5, 6b
     sb         t8, -1(t5)      // fourth store rides in the delay slot
    addiu       t1, t1, 1       // ++ci
    bne         t1, s0, 5b
     nop
    addiu       a1, a1, 4       // ++input_buf
    bgez        t9, 4b
     addiu      a3, a3, 1       // ++output_row (delay slot)
7:
    RESTORE_REGS_FROM_STACK 8, s0, s1

    j           ra
     nop

END(jsimd_c_null_convert_dspr2)


/*****************************************************************************/
/*
 * jsimd_extrgb_ycc_convert_dspr2
 * jsimd_extbgr_ycc_convert_dspr2
 * jsimd_extrgbx_ycc_convert_dspr2
 * jsimd_extbgrx_ycc_convert_dspr2
 * jsimd_extxbgr_ycc_convert_dspr2
 * jsimd_extxrgb_ycc_convert_dspr2
 *
 * Colorspace conversion RGB -> YCbCr
 *
 * Each function is produced by one invocation of
 * GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2:
 *   colorid                - pixel-format name spliced into the function name
 *   pixel_size             - bytes per input pixel (input pointer stride)
 *   r_offs, g_offs, b_offs - byte offsets of R, G and B within one pixel
 *
 * Both the row loop and the pixel loop are tested at the bottom, so one row
 * and one pixel are always processed.
 * NOTE(review): callers presumably guarantee num_rows >= 1 and
 * image_width >= 1 - confirm.
 */

.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2  colorid, pixel_size, \
                                             r_offs, g_offs, b_offs

/* Load the R, G and B bytes of one pixel, then advance inptr by one pixel. */
.macro DO_RGB_TO_YCC  r, g, b, inptr
    lbu         \r, \r_offs(\inptr)
    lbu         \g, \g_offs(\inptr)
    lbu         \b, \b_offs(\inptr)
    addiu       \inptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    // num_rows is documented at 16(sp) but read at 48(sp); presumably
    // SAVE_REGS_ON_STACK lowers sp by its first argument (32).
    lw          t7, 48(sp)      // t7 = num_rows
    li          s0, 0x4c8b      // FIX(0.29900)
    li          s1, 0x9646      // FIX(0.58700)
    li          s2, 0x1d2f      // FIX(0.11400)
    li          s3, 0xffffd4cd  // -FIX(0.16874)
    li          s4, 0xffffab33  // -FIX(0.33126)
    li          s5, 0x8000      // FIX(0.50000)
    li          s6, 0xffff94d1  // -FIX(0.41869)
    li          s7, 0xffffeb2f  // -FIX(0.08131)
    li          t8, 0x807fff    // CBCR_OFFSET + ONE_HALF-1

    // Row loop: one input row is converted into one row of each of the
    // three output planes.
0:
    addiu       t7, -1          // --num_rows
    lw          t6, 0(a1)       // t6 = input_buf[0]
    lw          t0, 0(a2)       // t0 = output_buf[0]
    lw          t1, 4(a2)       // t1 = output_buf[1]
    lw          t2, 8(a2)       // t2 = output_buf[2]
    sll         t3, a3, 2       // t3 = output_row * 4 (row pointer index)
    lwx         t0, t3(t0)      // t0 = output_buf[0][output_row]
    lwx         t1, t3(t1)      // t1 = output_buf[1][output_row]
    lwx         t2, t3(t2)      // t2 = output_buf[2][output_row]

    addu        t9, t2, a0      // t9 = end address
    addiu       a3, 1           // ++output_row

    // Pixel loop.  The three sums are built in ac0 (Y), ac1 (Cb) and
    // ac2 (Cr) with their multiply-adds interleaved.
1:
    // t3 = r, t4 = g, t5 = b; t6 advances by pixel_size
    DO_RGB_TO_YCC t3, t4, t5, t6

    mtlo        s5, $ac0        // ac0.lo = ONE_HALF (FIX(0.5) doubles as it)
    mtlo        t8, $ac1        // ac1.lo = CBCR_OFFSET + ONE_HALF-1
    mtlo        t8, $ac2        // ac2.lo = CBCR_OFFSET + ONE_HALF-1
    maddu       $ac0, s2, t5    // Y  +=  FIX(0.11400) * b
    maddu       $ac1, s5, t5    // Cb +=  FIX(0.50000) * b
    maddu       $ac2, s5, t3    // Cr +=  FIX(0.50000) * r
    maddu       $ac0, s0, t3    // Y  +=  FIX(0.29900) * r
    maddu       $ac1, s3, t3    // Cb += -FIX(0.16874) * r
    maddu       $ac2, s6, t4    // Cr += -FIX(0.41869) * g
    maddu       $ac0, s1, t4    // Y  +=  FIX(0.58700) * g
    maddu       $ac1, s4, t4    // Cb += -FIX(0.33126) * g
    maddu       $ac2, s7, t5    // Cr += -FIX(0.08131) * b
    extr.w      t3, $ac0, 16    // t3 = Y  sum >> 16
    extr.w      t4, $ac1, 16    // t4 = Cb sum >> 16
    extr.w      t5, $ac2, 16    // t5 = Cr sum >> 16
    sb          t3, 0(t0)       // only the low byte of each result is stored
    sb          t4, 0(t1)
    sb          t5, 0(t2)
    addiu       t0, 1
    addiu       t2, 1
    bne         t2, t9, 1b      // loop until the Cr pointer reaches the end
     addiu      t1, 1           // delay slot: advance the Cb pointer
    bgtz        t7, 0b          // more rows left
     addiu      a1, 4           // delay slot: ++input_buf

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_\colorid\()_ycc_convert_dspr2)

.purgem DO_RGB_TO_YCC

.endm

/*-------------------------------------id -- pix R  G  B */
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3


/*****************************************************************************/
/*
 * jsimd_ycc_extrgb_convert_dspr2
 * jsimd_ycc_extbgr_convert_dspr2
 * jsimd_ycc_extrgbx_convert_dspr2
 * jsimd_ycc_extbgrx_convert_dspr2
 * jsimd_ycc_extxbgr_convert_dspr2
 * jsimd_ycc_extxrgb_convert_dspr2
 *
 * Colorspace conversion YCbCr -> RGB
 *
 * Each function is produced by one invocation of
 * GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2:
 *   colorid                - pixel-format name spliced into the function name
 *   pixel_size             - bytes per output pixel (3 or 4)
 *   r_offs, g_offs, b_offs - byte offsets of R, G and B within one pixel
 *   a_offs                 - byte offset of the filler byte, written as 0xFF
 *                            only when pixel_size == 4
 *
 * Both the row loop and the pixel loop are tested at the bottom, so one row
 * and one pixel are always processed.
 * NOTE(review): callers presumably guarantee num_rows >= 1 and
 * image_width >= 1 - confirm.
 */

.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2  colorid, pixel_size, \
                                             r_offs, g_offs, b_offs, a_offs

/*
 * Store one pixel (scratch0 = R, scratch1 = G, scratch2 = B) at outptr and
 * advance outptr by one pixel.  For 4-byte pixels t0 is overwritten with
 * 0xFF after the three colour bytes have been stored.
 */
.macro STORE_YCC_TO_RGB  scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r_offs(\outptr)
    sb          \scratch1, \g_offs(\outptr)
    sb          \scratch2, \b_offs(\outptr)
.if (\pixel_size == 4)
    li          t0, 0xFF
    sb          t0, \a_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = input_row
 * a3     = output_buf
 * 16(sp) = num_rows
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    // num_rows is documented at 16(sp) but read at 48(sp); presumably
    // SAVE_REGS_ON_STACK lowers sp by its first argument (32).
    lw          s1, 48(sp)      // s1 = num_rows
    li          t3, 0x8000      // rounding term added to the green sum
    li          t4, 0x166e9     // FIX(1.40200)
    li          t5, 0x1c5a2     // FIX(1.77200)
    li          t6, 0xffff492e  // -FIX(0.71414)
    li          t7, 0xffffa7e6  // -FIX(0.34414)
    repl.ph     t8, 128         // t8 = 128 in both halfwords

    // Row loop: one row of the three input planes becomes one output row.
0:
    lw          s0, 0(a3)       // s0 = outptr = *output_buf
    lw          t0, 0(a1)       // t0 = input_buf[0]
    lw          t1, 4(a1)       // t1 = input_buf[1]
    lw          t2, 8(a1)       // t2 = input_buf[2]
    sll         s5, a2, 2       // s5 = input_row * 4 (row pointer index)
    addiu       s1, -1          // --num_rows
    lwx         s2, s5(t0)      // s2 = input_buf[0][input_row] (y)
    lwx         s3, s5(t1)      // s3 = input_buf[1][input_row] (cb)
    lwx         s4, s5(t2)      // s4 = input_buf[2][input_row] (cr)
    addu        t9, s2, a0      // t9 = end address of the y row
    addiu       a2, 1           // ++input_row

    // Pixel loop.
1:
    lbu         s7, 0(s4)       // cr
    lbu         s6, 0(s3)       // cb
    lbu         s5, 0(s2)       // y
    addiu       s2, 1
    addiu       s4, 1
    addiu       s7, -128        // cr - 128
    addiu       s6, -128        // cb - 128
    mul         t2, t7, s6      // t2 = -FIX(0.34414) * (cb - 128)
    mul         t0, t6, s7      // Crgtab[cr]
    sll         s7, 15          // pre-scale so that mulq_rs.w acts as >> 16
    mulq_rs.w   t1, t4, s7      // Crrtab[cr]
    sll         s6, 15          // same pre-scaling for cb
    addu        t2, t3          // Cbgtab[cb]
    addu        t2, t0          // t2 = green chroma sum, still scaled

    mulq_rs.w   t0, t5, s6      // Cbbtab[cb]
    sra         t2, 16          // t2 = green chroma offset
    addu        t1, s5          // t1 = y + red offset
    addu        t2, s5          // add y
    ins         t2, t1, 16, 16  // t2 = red (upper half) | green (lower half)
    subu.ph     t2, t2, t8      // bias both halves by -128
    addu        t0, s5          // t0 = y + blue offset
    shll_s.ph   t2, t2, 8       // saturating shift left, undone by the
    subu        t0, 128         //   shra.ph below: clamps red and green
    shra.ph     t2, t2, 8       //   to [-128, 127]
    shll_s.w    t0, t0, 24      // same clamp for blue on the full word
    addu.ph     t2, t2, t8      // clip & store
    sra         t0, t0, 24
    sra         t1, t2, 16      // t1 = clamped red
    addiu       t0, 128         // t0 = clamped blue, bias removed

    // R = t1, G = low byte of t2, B = t0; s0 advances by pixel_size
    STORE_YCC_TO_RGB t1, t2, t0, s0

    bne         s2, t9, 1b      // loop until the y pointer reaches the end
     addiu      s3, 1           // delay slot: advance the cb pointer
    bgtz        s1, 0b          // more rows left
     addiu      a3, 4           // delay slot: ++output_buf

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_ycc_\colorid\()_convert_dspr2)

.purgem STORE_YCC_TO_RGB

.endm

/*-------------------------------------id -- pix R  G  B  A */
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb,  3, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr,  3, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0


/*****************************************************************************/
/*
 * jsimd_extrgb_gray_convert_dspr2
 * jsimd_extbgr_gray_convert_dspr2
 * jsimd_extrgbx_gray_convert_dspr2
 * jsimd_extbgrx_gray_convert_dspr2
 * jsimd_extxbgr_gray_convert_dspr2
 * jsimd_extxrgb_gray_convert_dspr2
 *
 * Colorspace conversion RGB -> GRAY
 *
 * Each function is produced by one invocation of
 * GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2:
 *   colorid                - pixel-format name spliced into the function name
 *   pixel_size             - bytes per input pixel (input pointer stride)
 *   r_offs, g_offs, b_offs - byte offsets of R, G and B within one pixel
 *
 * Each row is handled in two parts: groups of four pixels (label 1), then
 * the image_width & 3 leftover pixels one at a time (label 3).  The row
 * loop is tested at the bottom, so one row is always processed.
 * NOTE(review): callers presumably guarantee num_rows >= 1 - confirm.
 */

.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2  colorid, pixel_size, \
                                              r_offs, g_offs, b_offs

/* Load the R, G and B bytes of one pixel, then advance inptr by one pixel. */
.macro DO_RGB_TO_GRAY  r, g, b, inptr
    lbu         \r, \r_offs(\inptr)
    lbu         \g, \g_offs(\inptr)
    lbu         \b, \b_offs(\inptr)
    addiu       \inptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    li          s0, 0x4c8b      // s0 = FIX(0.29900)
    li          s1, 0x9646      // s1 = FIX(0.58700)
    li          s2, 0x1d2f      // s2 = FIX(0.11400)
    li          s7, 0x8000      // s7 = FIX(0.50000)
    // num_rows is documented at 16(sp) but read at 48(sp); presumably
    // SAVE_REGS_ON_STACK lowers sp by its first argument (32).
    lw          s6, 48(sp)      // s6 = num_rows
    andi        t7, a0, 3       // t7 = image_width & 3 (leftover pixels)

    // Row loop.
0:
    addiu       s6, -1          // --num_rows
    lw          t0, 0(a1)       // t0 = inptr = *input_buf
    lw          t1, 0(a2)       // t1 = output_buf[0]
    sll         t3, a3, 2       // t3 = output_row * 4 (row pointer index)
    lwx         t1, t3(t1)      // t1 = outptr = output_buf[0][output_row]
    addiu       a3, 1           // ++output_row
    addu        t9, t1, a0      // t9 = end of the output row
    subu        t8, t9, t7      // t8 = end of the four-pixel groups
    beq         t1, t8, 2f      // fewer than four pixels: leftovers only
     nop

    // Four pixels per iteration.  Pixels are summed pairwise in ac0 and
    // ac1; s7 seeds each sum with the rounding term before the >> 16.
1:
    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5    // pixel 0: += FIX(0.11400) * b
    maddu       $ac0, s1, t4    // pixel 0: += FIX(0.58700) * g
    maddu       $ac0, s0, t3    // pixel 0: += FIX(0.29900) * r
    mtlo        s7, $ac1
    maddu       $ac1, s2, s5    // pixel 1, same weights
    maddu       $ac1, s1, s4
    maddu       $ac1, s0, s3
    extr.w      t6, $ac0, 16    // t6 = gray value of pixel 0

    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5    // pixel 2
    maddu       $ac0, s1, t4
    extr.w      t2, $ac1, 16    // t2 = gray value of pixel 1
    maddu       $ac0, s0, t3
    mtlo        s7, $ac1
    maddu       $ac1, s2, s5    // pixel 3
    maddu       $ac1, s1, s4
    maddu       $ac1, s0, s3
    extr.w      t5, $ac0, 16    // t5 = gray value of pixel 2
    sb          t6, 0(t1)
    sb          t2, 1(t1)
    extr.w      t3, $ac1, 16    // t3 = gray value of pixel 3
    addiu       t1, 4
    sb          t5, -2(t1)
    sb          t3, -1(t1)
    bne         t1, t8, 1b
     nop

2:
    beqz        t7, 4f          // no leftover pixels in this row
     nop

    // Leftover pixels, one per iteration.
3:
    DO_RGB_TO_GRAY t3, t4, t5, t0

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    maddu       $ac0, s0, t3
    extr.w      t6, $ac0, 16
    sb          t6, 0(t1)
    addiu       t1, 1
    bne         t1, t9, 3b
     nop

4:
    bgtz        s6, 0b          // more rows left
     addiu      a1, 4           // delay slot: ++input_buf

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_\colorid\()_gray_convert_dspr2)

.purgem DO_RGB_TO_GRAY

.endm

/*-------------------------------------id --  pix R  G  B */
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3


/*****************************************************************************/
/*
 * jsimd_h2v2_merged_upsample_dspr2
 * jsimd_h2v2_extrgb_merged_upsample_dspr2
 * jsimd_h2v2_extrgbx_merged_upsample_dspr2
 * jsimd_h2v2_extbgr_merged_upsample_dspr2
 * jsimd_h2v2_extbgrx_merged_upsample_dspr2
 * jsimd_h2v2_extxbgr_merged_upsample_dspr2
 * jsimd_h2v2_extxrgb_merged_upsample_dspr2
 *
 * Merged h2v2 upsample routines
 *
 * GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 emits one function per invocation:
 *   colorid    - pixel-format name spliced into the function name
 *   pixel_size - bytes occupied by TWO output pixels (6 or 8)
 *   r1_offs, g1_offs, b1_offs, a1_offs - byte offsets for the first pixel
 *   r2_offs, g2_offs, b2_offs, a2_offs - byte offsets for the second pixel
 * The a1/a2 offsets are only used when pixel_size == 8, where the filler
 * bytes are written as 0xFF.
 *
 * One chroma sample pair (cb, cr) is shared by a 2x2 group of luma samples:
 * two pixels in each of the two output rows.
 */
.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
                                            r1_offs, g1_offs, \
                                            b1_offs, a1_offs, \
                                            r2_offs, g2_offs, \
                                            b2_offs, a2_offs

/*
 * Store two pixels (scratch0-2 = R, G, B of the first, scratch3-5 = R, G, B
 * of the second) at outptr and advance outptr past both.  When
 * pixel_size == 8, scratch0 is reused to hold 0xFF after it has been stored.
 */
.macro STORE_H2V2_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
                            scratch5 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
    sb          \scratch3, \r2_offs(\outptr)
    sb          \scratch4, \g2_offs(\outptr)
    sb          \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li          \scratch0, 0xFF
    sb          \scratch0, \a1_offs(\outptr)
    sb          \scratch0, \a2_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

/*
 * Store a single pixel (scratch0-2 = R, G, B) at outptr.  outptr is not
 * advanced.  When pixel_size == 8, t0 is overwritten with 0xFF.
 */
.macro STORE_H2V2_1_PIXEL  scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)

.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
/*
 * a0     = cinfo->output_width
 * a1     = input_buf
 * a2     = in_row_group_ctr
 * a3     = output_buf
 * 16(sp) = cinfo->sample_range_limit
 */
    // ra is saved because it serves as a scratch register below.
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    // The fifth argument is documented at 16(sp) but read at 56(sp);
    // presumably SAVE_REGS_ON_STACK lowers sp by its first argument (40).
    lw          t9, 56(sp)      // cinfo->sample_range_limit
    lw          v0, 0(a1)       // v0 = input_buf[0]
    lw          v1, 4(a1)       // v1 = input_buf[1]
    lw          t0, 8(a1)       // t0 = input_buf[2]
    sll         t1, a2, 3       // t1 = (in_row_group_ctr*2) * 4
    addiu       t2, t1, 4       // t2 = (in_row_group_ctr*2 + 1) * 4
    sll         t3, a2, 2       // t3 = in_row_group_ctr * 4
    lw          t4, 0(a3)       // t4 = output_buf[0]
    lwx         t1, t1(v0)      // t1 = input_buf[0][in_row_group_ctr*2]
    lwx         t2, t2(v0)      // t2 = input_buf[0][in_row_group_ctr*2 + 1]
    lwx         t5, t3(v1)      // t5 = input_buf[1][in_row_group_ctr]
    lwx         t6, t3(t0)      // t6 = input_buf[2][in_row_group_ctr]
    lw          t7, 4(a3)       // t7 = output_buf[1]
    li          s1, 0xe6ea
    addiu       t8, s1, 0x7fff    // t8 = 0x166e9 [FIX(1.40200)]
    addiu       s0, t8, 0x5eb9    // s0 = 0x1c5a2 [FIX(1.77200)]
    addiu       s1, zero, 0xa7e6  // s1 = 0xffffa7e6 [-FIX(0.34414)]
    xori        s2, s1, 0xeec8    // s2 = 0xffff492e [-FIX(0.71414)]
    srl         t3, a0, 1       // t3 = output_width / 2 (chroma samples)
    blez        t3, 2f          // output_width < 2: odd-column path only
     addu       t0, t5, t3      // t0 = end address
    // Main loop: one chroma sample pair, two pixels in each output row.
 1:
    lbu         t3, 0(t5)       // t3 = cb
    lbu         s3, 0(t6)       // s3 = cr
    addiu       t5, t5, 1
    addiu       t3, t3, -128    // (cb - 128)
    addiu       s3, s3, -128    // (cr - 128)
    mult        $ac1, s1, t3    // ac1  = -FIX(0.34414) * cb
    madd        $ac1, s2, s3    // ac1 += -FIX(0.71414) * cr
    sll         s3, s3, 15      // pre-scale so that mulq_rs.w acts as a
    sll         t3, t3, 15      //   rounded >> 16
    mulq_rs.w   s4, t8, s3      // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    extr_r.w    s5, $ac1, 16    // s5 = cgreen = rounded ac1 >> 16
    mulq_rs.w   s6, s0, t3      // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    lbu         v0, 0(t1)       // v0 = y, row 0, first pixel
    addiu       t6, t6, 1
    addiu       t1, t1, 2
    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // y+cred
    addu        s3, t9, s3      // y+cgreen
    addu        v1, t9, v1      // y+cblue
    lbu         AT, 0(t3)       // AT = range_limit[y + cred]
    lbu         s7, 0(s3)       // s7 = range_limit[y + cgreen]
    lbu         ra, 0(v1)       // ra = range_limit[y + cblue]
    lbu         v0, -1(t1)      // v0 = y, row 0, second pixel
    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // y+cred
    addu        s3, t9, s3      // y+cgreen
    addu        v1, t9, v1      // y+cblue
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)
    lbu         v0, 0(t2)       // v0 = y, row 1, first pixel

    // two pixels into output row 0
    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4

    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // y+cred
    addu        s3, t9, s3      // y+cgreen
    addu        v1, t9, v1      // y+cblue
    lbu         AT, 0(t3)
    lbu         s7, 0(s3)
    lbu         ra, 0(v1)
    lbu         v0, 1(t2)       // v0 = y, row 1, second pixel
    addiu       t2, t2, 2
    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // y+cred
    addu        s3, t9, s3      // y+cgreen
    addu        v1, t9, v1      // y+cblue
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)

    // two pixels into output row 1
    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7

    bne         t0, t5, 1b      // loop until the cb pointer reaches the end
     nop
    // Odd output_width: one more chroma pair, one pixel per output row.
2:
    andi        t0, a0, 1
    beqz        t0, 4f
     lbu        t3, 0(t5)       // delay slot: t3 = cb (loaded either way)
    lbu         s3, 0(t6)       // s3 = cr
    addiu       t3, t3, -128    // (cb - 128)
    addiu       s3, s3, -128    // (cr - 128)
    mult        $ac1, s1, t3    // ac1  = -FIX(0.34414) * cb
    madd        $ac1, s2, s3    // ac1 += -FIX(0.71414) * cr
    sll         s3, s3, 15
    sll         t3, t3, 15
    lbu         v0, 0(t1)       // v0 = y, row 0
    extr_r.w    s5, $ac1, 16    // s5 = cgreen
    mulq_rs.w   s4, t8, s3      // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w   s6, s0, t3      // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // y+cred
    addu        s3, t9, s3      // y+cgreen
    addu        v1, t9, v1      // y+cblue
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)
    lbu         v0, 0(t2)       // v0 = y, row 1

    // last pixel of output row 0
    STORE_H2V2_1_PIXEL t3, s3, v1, t4

    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // y+cred
    addu        s3, t9, s3      // y+cgreen
    addu        v1, t9, v1      // y+cblue
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)

    // last pixel of output row 1
    STORE_H2V2_1_PIXEL t3, s3, v1, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j           ra
     nop

END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V2_1_PIXEL
.purgem STORE_H2V2_2_PIXELS
.endm

/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4


/*****************************************************************************/
/*
 * jsimd_h2v1_merged_upsample_dspr2
 * jsimd_h2v1_extrgb_merged_upsample_dspr2
 * jsimd_h2v1_extrgbx_merged_upsample_dspr2
 * jsimd_h2v1_extbgr_merged_upsample_dspr2
 * jsimd_h2v1_extbgrx_merged_upsample_dspr2
 * jsimd_h2v1_extxbgr_merged_upsample_dspr2
 * jsimd_h2v1_extxrgb_merged_upsample_dspr2
 *
 * Merged h2v1 upsample routines
 *
 * GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 emits one function per invocation:
 *   colorid    - pixel-format name spliced into the function name
 *   pixel_size - bytes occupied by TWO output pixels (6 or 8)
 *   r1_offs, g1_offs, b1_offs, a1_offs - byte offsets for the first pixel
 *   r2_offs, g2_offs, b2_offs, a2_offs - byte offsets for the second pixel
 * The a1/a2 offsets are only used when pixel_size == 8, where the filler
 * bytes are written as 0xFF.
 *
 * One chroma sample pair (cb, cr) is shared by two horizontally adjacent
 * luma samples of a single output row.
 */

.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
                                            r1_offs, g1_offs, \
                                            b1_offs, a1_offs, \
                                            r2_offs, g2_offs, \
                                            b2_offs, a2_offs

/*
 * Store two pixels (scratch0-2 = R, G, B of the first, scratch3-5 = R, G, B
 * of the second) at outptr and advance outptr past both.  When
 * pixel_size == 8, t0 is overwritten with 0xFF.
 */
.macro STORE_H2V1_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
                            scratch5 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
    sb          \scratch3, \r2_offs(\outptr)
    sb          \scratch4, \g2_offs(\outptr)
    sb          \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
    sb          t0, \a2_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

/*
 * Store a single pixel (scratch0-2 = R, G, B) at outptr.  outptr is not
 * advanced.  When pixel_size == 8, t0 is overwritten with 0xFF.
 */
.macro STORE_H2V1_1_PIXEL  scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
/*
 * a0     = cinfo->output_width
 * a1     = input_buf
 * a2     = in_row_group_ctr
 * a3     = output_buf
 * 16(sp) = range_limit
 */
    // ra is saved because it serves as a scratch register below.
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    li          t0, 0xe6ea
    lw          t1, 0(a1)         // t1 = input_buf[0]
    lw          t2, 4(a1)         // t2 = input_buf[1]
    lw          t3, 8(a1)         // t3 = input_buf[2]
    // The fifth argument is documented at 16(sp) but read at 56(sp);
    // presumably SAVE_REGS_ON_STACK lowers sp by its first argument (40).
    lw          t8, 56(sp)        // t8 = range_limit
    addiu       s1, t0, 0x7fff    // s1 = 0x166e9 [FIX(1.40200)]
    addiu       s2, s1, 0x5eb9    // s2 = 0x1c5a2 [FIX(1.77200)]
    addiu       s0, t0, 0x9916    // s0 = 0x8000
    addiu       s4, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
    xori        s3, s4, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
    srl         t0, a0, 1       // t0 = output_width / 2 (chroma samples)
    sll         t4, a2, 2       // t4 = in_row_group_ctr * 4
    lwx         s5, t4(t1)      // s5 = inptr0
    lwx         s6, t4(t2)      // s6 = inptr1
    lwx         s7, t4(t3)      // s7 = inptr2
    lw          t7, 0(a3)       // t7 = outptr
    blez        t0, 2f          // output_width < 2: odd-column path only
     addu       t9, s6, t0      // t9 = end address
    // Main loop: one chroma sample pair, two output pixels.
1:
    lbu         t2, 0(s6)       // t2 = cb
    lbu         t0, 0(s7)       // t0 = cr
    lbu         t1, 0(s5)       // t1 = y
    addiu       t2, t2, -128    // t2 = cb - 128
    addiu       t0, t0, -128    // t0 = cr - 128
    mult        $ac1, s4, t2    // ac1  = -FIX(0.34414) * cb
    madd        $ac1, s3, t0    // ac1 += -FIX(0.71414) * cr
    sll         t0, t0, 15      // pre-scale so that mulq_rs.w acts as a
    sll         t2, t2, 15      //   rounded >> 16
    mulq_rs.w   t0, s1, t0      // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
    extr_r.w    t5, $ac1, 16    // t5 = cgreen = rounded ac1 >> 16
    mulq_rs.w   t6, s2, t2      // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
    addiu       s7, s7, 1
    addiu       s6, s6, 1
    addu        t2, t1, t0      // t2 = y + cred
    addu        t3, t1, t5      // t3 = y + cgreen
    addu        t4, t1, t6      // t4 = y + cblue
    addu        t2, t8, t2      // turn each sum into a range_limit address
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t1, 1(s5)       // t1 = y of the second pixel
    lbu         v0, 0(t2)       // v0 = range_limit[y + cred]
    lbu         v1, 0(t3)       // v1 = range_limit[y + cgreen]
    lbu         ra, 0(t4)       // ra = range_limit[y + cblue]
    addu        t2, t1, t0
    addu        t3, t1, t5
    addu        t4, t1, t6
    addu        t2, t8, t2
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t2, 0(t2)
    lbu         t3, 0(t3)
    lbu         t4, 0(t4)

    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7

    bne         t9, s6, 1b      // loop until the cb pointer reaches the end
     addiu      s5, s5, 2       // delay slot: advance the y pointer
    // Odd output_width: one more chroma pair, one output pixel.
2:
    andi        t0, a0, 1
    beqz        t0, 4f
     nop
3:
    lbu         t2, 0(s6)       // t2 = cb
    lbu         t0, 0(s7)       // t0 = cr
    lbu         t1, 0(s5)       // t1 = y
    addiu       t2, t2, -128    // (cb - 128)
    addiu       t0, t0, -128    // (cr - 128)
    mul         t3, s4, t2      // t3 = -FIX(0.34414) * cb
    mul         t4, s3, t0      // t4 = -FIX(0.71414) * cr
    sll         t0, t0, 15
    sll         t2, t2, 15
    mulq_rs.w   t0, s1, t0      // (C1*cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w   t6, s2, t2      // (C2*cb + ONE_HALF)>> SCALEBITS
    addu        t3, t3, s0      // add the 0x8000 rounding term
    addu        t3, t4, t3
    sra         t5, t3, 16      // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
    addu        t2, t1, t0      // y + cred
    addu        t3, t1, t5      // y + cgreen
    addu        t4, t1, t6      // y + cblue
    addu        t2, t8, t2
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t2, 0(t2)
    lbu         t3, 0(t3)
    lbu         t4, 0(t4)

    STORE_H2V1_1_PIXEL t2, t3, t4, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j           ra
     nop

END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V1_1_PIXEL
.purgem STORE_H2V1_2_PIXELS
.endm

/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4


/*****************************************************************************/
/*
 * jsimd_h2v2_fancy_upsample_dspr2
 *
 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
 *
 * Every input row yields two output rows: the first pass pairs it with the
 * row above (input_data[-1]), the second with the row below
 * (input_data[+1]).  A column sum is 3 * (sample of the current row) +
 * (sample of the paired row), and every column produces two output samples
 * from its own sum and the sum of the column to its left or right.
 *
 * NOTE(review): the loop bounds are derived from downsampled_width - 2 with
 * logical shifts, so downsampled_width is presumably >= 2 - confirm.
 */
LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
/*
 * a0 = cinfo->max_v_samp_factor
 * a1 = downsampled_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    li            s4, 0           // s4 = byte offset of the output row pointer
    lw            s2, 0(a3)       // s2 = *output_data_ptr
    // Per input row: two passes (t9 counts them down).
0:
    li            t9, 2
    lw            s1, -4(a2)      // s1 = inptr1

    // Per output row.
1:
    lw            s0, 0(a2)       // s0 = inptr0
    lwx           s3, s4(s2)      // s3 = outptr for this output row
    addiu         s5, a1, -2      // s5 = downsampled_width - 2
    srl           t4, s5, 1
    sll           t4, t4, 1       // t4 = (downsampled_width - 2) rounded down
                                  //      to an even count
    lbu           t0, 0(s0)
    lbu           t1, 1(s0)
    lbu           t2, 0(s1)
    lbu           t3, 1(s1)
    addiu         s0, 2
    addiu         s1, 2
    addu          t8, s0, t4      // t8 = end address
    andi          s5, s5, 1       // s5 = residual
    sll           t4, t0, 1
    sll           t6, t1, 1
    addu          t0, t0, t4      // t0 = (*inptr0++) * 3
    addu          t1, t1, t6      // t1 = (*inptr0++) * 3
    addu          t7, t0, t2      // t7 = thiscolsum
    addu          t6, t1, t3      // t6 = nextcolsum
    // First column: it has no left neighbour.
    sll           t0, t7, 2       // t0 = thiscolsum * 4
    subu          t1, t0, t7      // t1 = thiscolsum * 3
    shra_r.w      t0, t0, 4       // t0 = (thiscolsum * 4 + 8) >> 4
    addiu         t1, 7
    addu          t1, t1, t6
    srl           t1, t1, 4       // t1 = (thiscolsum * 3 + nextcolsum + 7) >> 4
    sb            t0, 0(s3)
    sb            t1, 1(s3)
    beq           t8, s0, 22f     // no column pairs left (width is 2 or 3)
     addiu        s3, 2
    // Two columns per iteration.  On entry t7 = sum of the column to the
    // left ("last") and t6 = sum of the column being output ("this").
2:
    lh            t0, 0(s0)       // t0 = A3|A2
    lh            t2, 0(s1)       // t2 = B3|B2
    addiu         s0, 2
    addiu         s1, 2
    preceu.ph.qbr t0, t0          // t0 = 0|A3|0|A2
    preceu.ph.qbr t2, t2          // t2 = 0|B3|0|B2
    shll.ph       t1, t0, 1
    sll           t3, t6, 1
    addu.ph       t0, t1, t0      // t0 = A3*3|A2*3
    addu          t3, t3, t6      // t3 = this * 3
    addu.ph       t0, t0, t2      // t0 = next2|next1
    addu          t1, t3, t7
    andi          t7, t0, 0xFFFF  // t7 = next1
    sll           t2, t7, 1
    addu          t2, t7, t2      // t2 = next1*3
    addu          t4, t2, t6
    srl           t6, t0, 16      // t6 = next2
    shra_r.w      t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
    addu          t0, t3, t7
    addiu         t0, 7
    srl           t0, t0, 4       // t0 = (this*3 + next1 + 7) >> 4
    shra_r.w      t4, t4, 4       // t4 = (next1*3 + this + 8) >> 4
    addu          t2, t2, t6
    addiu         t2, 7
    srl           t2, t2, 4       // t2 = (next1*3 + next2 + 7) >> 4
    sb            t1, 0(s3)
    sb            t0, 1(s3)
    sb            t4, 2(s3)
    sb            t2, 3(s3)
    bne           t8, s0, 2b
     addiu        s3, 4
    // Optional single leftover column (s5 is 0 or 1).
22:
    beqz          s5, 4f
     addu         t8, s0, s5      // delay slot: t8 = end of the leftovers
3:
    lbu           t0, 0(s0)
    lbu           t2, 0(s1)
    addiu         s0, 1
    addiu         s1, 1
    sll           t3, t6, 1
    sll           t1, t0, 1
    addu          t1, t0, t1      // t1 = inptr0 * 3
    addu          t3, t3, t6      // t3 = thiscolsum * 3
    addu          t5, t1, t2      // t5 = nextcolsum
    addu          t1, t3, t7
    shra_r.w      t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
    addu          t0, t3, t5
    addiu         t0, 7
    srl           t0, t0, 4       // t0 = (this*3 + next + 7) >> 4
    sb            t1, 0(s3)
    sb            t0, 1(s3)
    addiu         s3, 2
    move          t7, t6          // last = this
    bne           t8, s0, 3b
     move         t6, t5          // delay slot: this = next
    // Last column: it has no right neighbour.
4:
    sll           t0, t6, 2       // t0 = thiscolsum * 4
    subu          t1, t0, t6      // t1 = thiscolsum * 3
    addu          t1, t1, t7
    addiu         s4, 4           // step to the next output row pointer
    shra_r.w      t1, t1, 4       // t1 = (thiscolsum * 3 + lastcolsum + 8) >> 4
    addiu         t0, 7
    srl           t0, t0, 4       // t0 = (thiscolsum * 4 + 7) >> 4
    sb            t1, 0(s3)
    sb            t0, 1(s3)
    addiu         t9, -1
    addiu         s3, 2
    bnez          t9, 1b          // second pass for this input row
     lw           s1, 4(a2)       // delay slot: inptr1 = the row below
    srl           t0, s4, 2       // t0 = output rows produced so far
    subu          t0, a0, t0
    bgtz          t0, 0b          // continue while fewer than
     addiu        a2, 4           //   max_v_samp_factor rows; ++input_data

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j             ra
     nop
END(jsimd_h2v2_fancy_upsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
/*
 * a0 = cinfo->max_v_samp_factor
 * a1 = downsampled_width
 * a2 = input_data
 * a3 = output_data_ptr
 *
 * Doubles each of max_v_samp_factor rows horizontally.  Every input sample
 * produces two output samples, each weighted 3:1 between the sample itself
 * and its left or right neighbour; the first and last output samples of a
 * row are plain copies of the first and last input samples.
 *
 * Per row: the first column is special-cased, then (downsampled_width - 2)
 * columns are processed, four at a time (label 1) and then singly
 * (label 2), and finally the last column is special-cased (label 22).
 *
 * NOTE(review): downsampled_width is presumably >= 2, since the counts are
 * derived from downsampled_width - 2 - confirm.
 */
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    .set at

    beqz          a0, 3f          // no rows requested
     sll          t0, a0, 2       // delay slot: t0 = max_v_samp_factor * 4
    lw            s1, 0(a3)       // s1 = *output_data_ptr (row pointer array)
    li            s3, 0x10001     // s3 = 1 in both halfwords
    addu          s0, s1, t0      // s0 = end of the output row pointers
    // Row loop.
0:
    addiu         t8, a1, -2      // t8 = downsampled_width - 2
    srl           t9, t8, 2       // t9 = number of four-column groups
    lw            t7, 0(a2)       // t7 = inptr
    lw            s2, 0(s1)       // s2 = outptr
    lbu           t0, 0(t7)       // t0 = invalue
    lbu           t1, 1(t7)       // t1 = inptr[1]
    sll           t2, t0, 1
    addu          t2, t2, t0      // t2 = invalue*3
    addu          t2, t2, t1
    shra_r.w      t2, t2, 2       // t2 = (invalue*3 + inptr[1] + 2) >> 2
    sb            t0, 0(s2)       // first output sample is a plain copy
    sb            t2, 1(s2)
    beqz          t9, 11f         // fewer than four middle columns
     addiu        s2, 2
    // Four columns (P1..P4) per iteration, eight output bytes.  t7 points
    // at P0, the column to the left of the group.
1:
    ulw           t0, 0(t7)       // t0 = |P3|P2|P1|P0|
    ulw           t1, 1(t7)       // t1 = |P4|P3|P2|P1|
    ulh           t2, 4(t7)       // t2 = |0|0|P5|P4|
    preceu.ph.qbl t3, t0          // t3 = |0|P3|0|P2|
    preceu.ph.qbr t0, t0          // t0 = |0|P1|0|P0|
    preceu.ph.qbr t2, t2          // t2 = |0|P5|0|P4|
    preceu.ph.qbl t4, t1          // t4 = |0|P4|0|P3|
    preceu.ph.qbr t1, t1          // t1 = |0|P2|0|P1|
    shll.ph       t5, t4, 1
    shll.ph       t6, t1, 1
    addu.ph       t5, t5, t4      // t5 = |P4*3|P3*3|
    addu.ph       t6, t6, t1      // t6 = |P2*3|P1*3|
    addu.ph       t4, t3, s3      // left neighbours + 1
    addu.ph       t0, t0, s3
    addu.ph       t4, t4, t5
    addu.ph       t0, t0, t6
    shrl.ph       t4, t4, 2       // t4 = first outputs of P4 | P3
    shrl.ph       t0, t0, 2       // t0 = first outputs of P2 | P1
    addu.ph       t2, t2, t5
    addu.ph       t3, t3, t6
    shra_r.ph     t2, t2, 2       // t2 = second outputs of P4 | P3 (+2 >> 2)
    shra_r.ph     t3, t3, 2       // t3 = second outputs of P2 | P1 (+2 >> 2)
    shll.ph       t2, t2, 8       // move the second outputs into the odd
    shll.ph       t3, t3, 8       //   bytes
    or            t2, t4, t2      // t2 = four output bytes for P3, P4
    or            t3, t3, t0      // t3 = four output bytes for P1, P2
    addiu         t9, -1
    usw           t3, 0(s2)
    usw           t2, 4(s2)
    addiu         s2, 8
    bgtz          t9, 1b
     addiu        t7, 4
    // Remaining middle columns, one per iteration.
11:
    andi          t8, 3           // t8 = middle columns still to do
    beqz          t8, 22f
     addiu        t7, 1           // delay slot: t7 = next unprocessed column

2:
    lbu           t0, 0(t7)       // t0 = invalue
    addiu         t7, 1
    sll           t1, t0, 1
    addu          t2, t0, t1      // t2 = invalue * 3
    lbu           t3, -2(t7)      // t3 = left neighbour
    lbu           t4, 0(t7)       // t4 = right neighbour
    addiu         t3, 1
    addiu         t4, 2
    addu          t3, t3, t2
    addu          t4, t4, t2
    srl           t3, 2           // t3 = (invalue*3 + left + 1) >> 2
    srl           t4, 2           // t4 = (invalue*3 + right + 2) >> 2
    sb            t3, 0(s2)
    sb            t4, 1(s2)
    addiu         t8, -1
    bgtz          t8, 2b
     addiu        s2, 2

    // Last column: it has no right neighbour.
22:
    lbu           t0, 0(t7)       // t0 = invalue
    lbu           t2, -1(t7)      // t2 = left neighbour
    sll           t1, t0, 1
    addu          t1, t1, t0      // t1 = invalue * 3
    addu          t1, t1, t2
    addiu         t1, 1
    srl           t1, t1, 2       // t1 = (invalue*3 + left + 1) >> 2
    sb            t1, 0(s2)
    sb            t0, 1(s2)       // last output sample is a plain copy
    addiu         s1, 4           // next output row pointer
    bne           s1, s0, 0b
     addiu        a2, 4           // delay slot: ++input_data
3:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j             ra
     nop
END(jsimd_h2v1_fancy_upsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = cinfo->max_v_samp_factor
 * a2     = compptr->v_samp_factor
 * a3     = compptr->width_in_blocks
 * 16(sp) = input_data
 * 20(sp) = output_data
 *
 * 2:1 horizontal downsampling of a2 rows.  Each output sample is the average
 * of two adjacent input samples; the rounding bias alternates 0, 1, 0, 1, ...
 * along the row.  Output columns past image_width / 2, up to
 * width_in_blocks * DCTSIZE, are filled by replicating the last input sample.
 *
 * Per row:
 *   loop 1 - four output columns per iteration
 *   loop 2 - up to three leftover columns, one at a time
 *   loop 4 - right-edge padding, two columns (one halfword) per iteration
 *   label 5 - one extra padding byte when (image_width & 2) is set
 *
 * All per-row counters are recomputed at the top of every row because the
 * loops consume them, and the unrolled loop is skipped when the row has
 * fewer than four output columns.
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4

    beqz        a2, 7f
     lw         s1, 44(sp)      // s1 = output_data
    lw          s0, 40(sp)      // s0 = input_data
    andi        t9, a0, 2       // t9 != 0: one odd padding byte is needed
    andi        t6, a0, 1       // t6 = temp_index
    addiu       t6, -1          // t6 = 0 if image_width is odd, else -1
0:
    srl         s2, a0, 2
    srl         t7, t9, 1
    addu        s2, t7, s2      // s2 = (image_width >> 2) + ((image_width >> 1) & 1)
    sll         t0, a3, 3       // t0 = width_in_blocks*DCT
    srl         t7, t0, 1
    subu        s2, t7, s2      // s2 = number of padding halfwords for this row
    lw          t4, 0(s1)       // t4 = outptr
    lw          t5, 0(s0)       // t5 = inptr0
    li          s3, 0           // s3 = bias
    srl         t7, a0, 1       // t7 = image_width1
    srl         s4, t7, 2       // s4 = number of 4-column groups
    beqz        s4, 11f         // fewer than four output columns in this row
     andi       t8, t7, 3       // t8 = leftover columns
1:
    ulhu        t0, 0(t5)
    ulhu        t1, 2(t5)
    ulhu        t2, 4(t5)
    ulhu        t3, 6(t5)
    raddu.w.qb  t0, t0          // sum of the two samples of each pair
    raddu.w.qb  t1, t1
    raddu.w.qb  t2, t2
    raddu.w.qb  t3, t3
    shra.ph     t0, t0, 1       // bias 0
    shra_r.ph   t1, t1, 1       // bias 1 (rounding shift)
    shra.ph     t2, t2, 1       // bias 0
    shra_r.ph   t3, t3, 1       // bias 1 (rounding shift)
    sb          t0, 0(t4)
    sb          t1, 1(t4)
    sb          t2, 2(t4)
    sb          t3, 3(t4)
    addiu       s4, -1
    addiu       t4, 4
    bgtz        s4, 1b
     addiu      t5, 8
11:
    beqz        t8, 3f
     addu       s4, t4, t8      // s4 = end address of the leftover columns
2:
    ulhu        t0, 0(t5)
    raddu.w.qb  t0, t0
    addqh.w     t0, t0, s3      // (sum + bias) >> 1
    xori        s3, s3, 1       // alternate the bias
    sb          t0, 0(t4)
    addiu       t4, 1
    bne         t4, s4, 2b
     addiu      t5, 2
3:
    lbux        t1, t6(t5)      // t1 = last input sample of the row
    sll         t1, 1
    addqh.w     t2, t1, s3      // t2 = pixval1
    xori        s3, s3, 1
    addqh.w     t3, t1, s3      // t3 = pixval2
    blez        s2, 5f
     append     t3, t2,  8      // t3 = (pixval2 << 8) | pixval1
4:
    ush         t3, 0(t4)
    addiu       s2, -1
    bgtz        s2, 4b
     addiu      t4,  2
5:
    beqz        t9, 6f
     nop
    sb          t2, 0(t4)
6:
    addiu       s1, 4
    addiu       a2, -1
    bnez        a2, 0b
     addiu      s0, 4
7:
    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4

    j           ra
     nop
END(jsimd_h2v1_downsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = cinfo->max_v_samp_factor
 * a2     = compptr->v_samp_factor
 * a3     = compptr->width_in_blocks
 * 16(sp) = input_data
 * 20(sp) = output_data
 *
 * 2:1 horizontal and 2:1 vertical downsampling.  Each of the a2 output rows
 * is built from two input rows; each output sample is the average of a 2x2
 * block of input samples, with the rounding bias alternating 1, 2, 1, 2, ...
 * along the row.  Output columns past image_width / 2, up to
 * width_in_blocks * DCTSIZE, are filled from the last input column.
 *
 * Per output row:
 *   loop 2 - four output columns per iteration
 *   loop 3 - up to three leftover columns, one at a time
 *   loop 5 - right-edge padding, two columns (one halfword) per iteration
 *   label 6 - one extra padding byte when (image_width & 2) is set
 *
 * The loops consume s4, t8 and s2, so these are recomputed at the top of
 * every output row, and the unrolled loop is skipped when the row has fewer
 * than four output columns.
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    beqz        a2, 8f
     lw         s1, 52(sp)      // s1 = output_data
    lw          s0, 48(sp)      // s0 = input_data

    andi        t6, a0, 1       // t6 = temp_index
    addiu       t6, -1          // t6 = 0 if image_width is odd, else -1
    andi        t9, a0, 2       // t9 != 0: one odd padding byte is needed
0:
    srl         t7, a0, 1       // t7 = image_width1
    srl         s4, t7, 2       // s4 = number of 4-column groups
    andi        t8, t7, 3       // t8 = leftover columns
    srl         s2, a0, 2
    srl         t7, t9, 1
    addu        s2, t7, s2      // s2 = (image_width >> 2) + ((image_width >> 1) & 1)
    sll         t0, a3, 3       // t0 = width_in_blocks*DCT
    srl         t7, t0, 1
    subu        s2, t7, s2      // s2 = number of padding halfwords for this row
    lw          t4, 0(s1)       // t4 = outptr
    lw          t5, 0(s0)       // t5 = inptr0
    lw          s7, 4(s0)       // s7 = inptr1
    beqz        s4, 21f         // fewer than four output columns in this row
     li         s6, 1           // s6 = bias
2:
    ulw         t0, 0(t5)       // t0 = |P3|P2|P1|P0|
    ulw         t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
    ulw         t2, 4(t5)
    ulw         t3, 4(s7)
    precrq.ph.w t7, t0, t1      // t7 = |P3|P2|Q3|Q2|
    ins         t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
    raddu.w.qb  t1, t7          // t1 = P2 + P3 + Q2 + Q3
    raddu.w.qb  t0, t0          // t0 = P0 + P1 + Q0 + Q1
    shra_r.w    t1, t1, 2       // bias 2 (rounding shift)
    addiu       t0, 1           // bias 1
    srl         t0, 2
    precrq.ph.w t7, t2, t3
    ins         t2, t3, 16, 16
    raddu.w.qb  t7, t7
    raddu.w.qb  t2, t2
    shra_r.w    t7, t7, 2       // bias 2 (rounding shift)
    addiu       t2, 1           // bias 1
    srl         t2, 2
    sb          t0, 0(t4)
    sb          t1, 1(t4)
    sb          t2, 2(t4)
    sb          t7, 3(t4)
    addiu       t4, 4
    addiu       t5, 8
    addiu       s4, s4, -1
    bgtz        s4, 2b
     addiu      s7, 8
21:
    beqz        t8, 4f
     addu       t8, t4, t8      // t8 = end address of the leftover columns
3:
    ulhu        t0, 0(t5)
    ulhu        t1, 0(s7)
    ins         t0, t1, 16, 16
    raddu.w.qb  t0, t0          // sum of the 2x2 block
    addu        t0, t0, s6
    srl         t0, 2
    xori        s6, s6, 3       // alternate the bias between 1 and 2
    sb          t0, 0(t4)
    addiu       t5, 2
    addiu       t4, 1
    bne         t8, t4, 3b
     addiu      s7, 2
4:
    lbux        t1, t6(t5)      // last sample of the upper input row
    sll         t1, 1
    lbux        t0, t6(s7)      // last sample of the lower input row
    sll         t0, 1
    addu        t1, t1, t0      // t1 = sum of the replicated 2x2 block
    addu        t3, t1, s6
    srl         t0, t3, 2       // t0 = pixval1
    xori        s6, s6, 3
    addu        t2, t1, s6
    srl         t1, t2, 2       // t1 = pixval2
    blez        s2, 6f
     append     t1, t0, 8       // t1 = (pixval2 << 8) | pixval1
5:
    ush         t1, 0(t4)
    addiu       s2, -1
    bgtz        s2, 5b
     addiu      t4, 2
6:
    beqz        t9, 7f
     nop
    sb          t0, 0(t4)
7:
    addiu       s1, 4
    addiu       a2, -1
    bnez        a2, 0b
     addiu      s0, 8           // two input rows per output row
8:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_h2v2_downsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
/*
 * a0     = input_data
 * a1     = output_data
 * a2     = compptr->v_samp_factor
 * a3     = cinfo->max_v_samp_factor
 * 16(sp) = cinfo->smoothing_factor
 * 20(sp) = compptr->width_in_blocks
 * 24(sp) = cinfo->image_width
 *
 * 2:1 horizontal and vertical downsampling with smoothing.  For every output
 * sample:
 *   membersum = the four samples of the 2x2 input block
 *   neighsum  = 2 * (the eight edge-adjacent neighbours)
 *               + the four corner neighbours
 *   out = (membersum * (16384 - smoothing_factor * 80)
 *          + neighsum * (smoothing_factor * 16) + 32768) >> 16
 *
 * Stage 1 (loops 0/1) extends each of the max_v_samp_factor + 2 input rows,
 * starting at input_data[-1], on the right by replicating the last sample up
 * to 2 * width_in_blocks * DCTSIZE columns.
 * Stage 2 (loop 3) walks the a2 output rows; within a row the first and last
 * columns are special-cased, loop 4 handles the next two columns and loop 5
 * handles the remaining columns four at a time.
 *
 * The input row index advances by exactly two rows per output row.
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          s7, 52(sp)      // compptr->width_in_blocks
    lw          s0, 56(sp)      // cinfo->image_width
    lw          s6, 48(sp)      // cinfo->smoothing_factor
    sll         s7, 3           // output_cols = width_in_blocks * DCTSIZE
    sll         v0, s7, 1
    subu        v0, v0, s0      // v0 = output_cols * 2 - image_width (pad count)
    blez        v0, 2f
     move       v1, zero        // v1 = row counter
    addiu       t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2
0:
    addiu       t1, a0, -4      // rows start at input_data[-1]
    sll         t2, v1, 2
    lwx         t1, t2(t1)      // t1 = input_data[v1 - 1]
    move        t3, v0
    addu        t1, t1, s0      // t1 -> first column past image_width
    lbu         t2, -1(t1)      // t2 = last real sample of the row
1:
    addiu       t3, t3, -1
    sb          t2, 0(t1)
    bgtz        t3, 1b
     addiu      t1, t1, 1
    addiu       v1, v1, 1
    bne         v1, t0, 0b
     nop
2:
    li          v0, 80
    mul         v0, s6, v0
    li          v1, 16384
    move        t4, zero        // t4 = outrow
    move        t5, zero        // t5 = inrow
    subu        t6, v1, v0      // t6 = 16384 - tmp_smoot_f * 80
    sll         t7, s6, 4       // t7 = tmp_smoot_f * 16
3:
/* Special case for first column: pretend column -1 is same as column 0 */
    sll         v0, t4, 2
    lwx         t8, v0(a1)      //  outptr = output_data[outrow]
    sll         v1, t5, 2
    addiu       t9, v1, 4
    addiu       s0, v1, -4
    addiu       s1, v1, 8
    lwx         s2, v1(a0)      // inptr0 = input_data[inrow]
    lwx         t9, t9(a0)      // inptr1 = input_data[inrow+1]
    lwx         s0, s0(a0)      // above_ptr = input_data[inrow-1]
    lwx         s1, s1(a0)      // below_ptr = input_data[inrow+2]
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0          // t2 = membersum
    raddu.w.qb  s3, t0          // s3 = above[0..1] + below[0..1]
    lbu         v0, 0(s2)
    lbu         v1, 2(s2)
    lbu         t0, 0(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6    // ac1 = membersum * memberscale
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, 0(s0)
    lbu         t0, 0(s1)
    sll         s3, s3, 1       // edge-adjacent neighbours count twice
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3      // s3 = neighsum
    madd        $ac1, s3, t7    // ac1 += neighsum * neighscale
    extr_r.w    v0, $ac1, 16    // rounded >> 16
    addiu       t8, t8, 1
    addiu       s2, s2, 2
    addiu       t9, t9, 2
    addiu       s0, s0, 2
    addiu       s1, s1, 2
    sb          v0, -1(t8)
    addiu       s4, s7, -2
    and         s4, s4, 3       // s4 = (output_cols - 2) & 3
    addu        s5, s4, t8      // end address
4:
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 2(s2)
    lbu         t0, -1(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    addiu       t8, t8, 1
    addiu       s2, s2, 2
    addiu       t9, t9, 2
    addiu       s0, s0, 2
    sb          t2, -1(t8)
    bne         s5, t8, 4b
     addiu      s1, s1, 2
    addiu       s5, s7, -2
    subu        s5, s5, s4
    addu        s5, s5, t8      // end address
5:
    /* Four output columns per iteration; the loads for the next column are
       interleaved with the arithmetic of the current one. */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 2(s2)
    lbu         t0, -1(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    lh          v1, 2(t9)
    addu        t0, t0, v0
    lh          v0, 2(s2)
    addu        s3, t0, s3
    lh          t0, 2(s0)
    lh          t1, 2(s1)
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 4(s2)
    lbu         t0, 1(t9)
    lbu         t1, 4(t9)
    sb          t2, 0(t8)
    raddu.w.qb  t3, v0
    lbu         v0, 1(s2)
    addu        t0, t0, t1
    mult        $ac1, t3, t6
    addu        v0, v0, v1
    lbu         t2, 4(s0)
    addu        t0, t0, v0
    lbu         v0, 1(s0)
    addu        s3, t0, s3
    lbu         t0, 1(s1)
    lbu         t3, 4(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    lh          v1, 4(t9)
    addu        t0, t0, v0
    lh          v0, 4(s2)
    addu        s3, t0, s3
    lh          t0, 4(s0)
    lh          t1, 4(s1)
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 6(s2)
    lbu         t0, 3(t9)
    lbu         t1, 6(t9)
    sb          t2, 1(t8)
    raddu.w.qb  t3, v0
    lbu         v0, 3(s2)
    addu        t0, t0, t1
    mult        $ac1, t3, t6
    addu        v0, v0, v1
    lbu         t2, 6(s0)
    addu        t0, t0, v0
    lbu         v0, 3(s0)
    addu        s3, t0, s3
    lbu         t0, 3(s1)
    lbu         t3, 6(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    lh          v1, 6(t9)
    addu        t0, t0, v0
    lh          v0, 6(s2)
    addu        s3, t0, s3
    lh          t0, 6(s0)
    lh          t1, 6(s1)
    madd        $ac1, s3, t7
    extr_r.w    t3, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 8(s2)
    lbu         t0, 5(t9)
    lbu         t1, 8(t9)
    sb          t3, 2(t8)
    raddu.w.qb  t2, v0
    lbu         v0, 5(s2)
    addu        t0, t0, t1
    mult        $ac1, t2, t6
    addu        v0, v0, v1
    lbu         t2, 8(s0)
    addu        t0, t0, v0
    lbu         v0, 5(s0)
    addu        s3, t0, s3
    lbu         t0, 5(s1)
    lbu         t3, 8(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    addiu       t8, t8, 4
    addu        t0, t0, v0
    addiu       s2, s2, 8
    addu        s3, t0, s3
    addiu       t9, t9, 8
    madd        $ac1, s3, t7
    extr_r.w    t1, $ac1, 16
    addiu       s0, s0, 8
    addiu       s1, s1, 8
    bne         s5, t8, 5b
     sb         t1, -1(t8)
/* Special case for last column */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 1(s2)
    lbu         t0, -1(t9)
    lbu         t1, 1(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 1(s0)
    addu        t0, t0, v0
    lbu         t3, 1(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3
    madd        $ac1, s3, t7
    extr_r.w    t0, $ac1, 16
    sb          t0, 0(t8)
    addiu       t4, t4, 1       // outrow++
    bne         t4, a2, 3b
     addiu      t5, t5, 2       // inrow += 2 (two input rows per output row)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop

END(jsimd_h2v2_smooth_downsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_int_upsample_dspr2)
/*
 * a0     = upsample->h_expand[compptr->component_index]
 * a1     = upsample->v_expand[compptr->component_index]
 * a2     = input_data
 * a3     = output_data_ptr
 * 16(sp) = cinfo->output_width
 * 20(sp) = cinfo->max_v_samp_factor
 *
 * Upsampling by integral factors using pixel replication.  For each input
 * row, every sample is written h_expand times into output row `outrow`, and
 * that finished output row is then copied into the following v_expand - 1
 * output rows.  inrow advances by one and outrow by v_expand until outrow
 * reaches max_v_samp_factor.
 *
 * Register use inside the row loop:
 *   t6 = inrow * 4 (byte offset into the input row-pointer array)
 *   s3 = outrow (row index)
 *   t7 = &output_data[outrow]
 *   v0 = output_data[outrow] (source of the vertical copies)
 *   v1 = cursor over the destination row pointers
 *
 * Returns immediately when max_v_samp_factor or output_width is not positive.
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    lw          s0, 0(a3)       // s0 = output_data
    lw          s1, 32(sp)      // s1 = cinfo->output_width
    lw          s2, 36(sp)      // s2 = cinfo->max_v_samp_factor
    li          t6, 0           // t6 = inrow * 4
    blez        s2, 10f
     li         s3, 0           // s3 = outrow
    blez        s1, 10f         // empty rows: nothing to expand or copy
     nop
0:
    sll         t7, s3, 2       // t7 = outrow * 4
    addu        t0, a2, t6      // t0 = &input_data[inrow]
    addu        t7, s0, t7      // t7 = &output_data[outrow]
    lw          t3, 0(t0)       // t3 = inptr
    lw          t8, 0(t7)       // t8 = outptr
    addu        t5, t8, s1      // t5 = outend
1:
    lb          t2, 0(t3)       // t2 = invalue = *inptr++
    addiu       t3, 1
    beqz        a0, 3f
     move       t0, a0          // t0 = h_expand
2:
    sb          t2, 0(t8)
    addiu       t0, -1
    bgtz        t0, 2b
     addiu      t8, 1
3:
    bgt         t5, t8, 1b
     nop
4:
    addiu       t9, a1, -1      // t9 = v_expand - 1 = rows still to duplicate
    blez        t9, 9f
     move       v1, t7          // v1 = &output_data[outrow]
    lw          v0, 0(v1)       // v0 = output_data[outrow], the source row
5:
    addiu       v1, 4           // next destination row pointer
    move        t3, v0          // t3 = source cursor
    lw          t4, 0(v1)       // t4 = destination cursor
    subu        t0, s1, 0xF
    blez        t0, 7f          // fewer than 16 bytes: byte loop only
     addu       t5, t3, s1      // t5 = end address
    andi        t7, s1, 0xF     // t7 = residual
    subu        t8, t5, t7      // t8 = end of the 16-byte chunks
6:
    ulw         t0, 0(t3)
    ulw         t1, 4(t3)
    ulw         t2, 8(t3)
    usw         t0, 0(t4)
    ulw         t0, 12(t3)
    usw         t1, 4(t4)
    usw         t2, 8(t4)
    usw         t0, 12(t4)
    addiu       t3, 16
    bne         t3, t8, 6b
     addiu      t4, 16
    beqz        t7, 8f
     nop
7:
    lbu         t0, 0(t3)
    sb          t0, 0(t4)
    addiu       t3, 1
    bne         t3, t5, 7b
     addiu      t4, 1
8:
    addiu       t9, -1
    bgtz        t9, 5b
     nop
9:
    addu        s3, s3, a1      // outrow += v_expand
    slt         t0, s3, s2      // continue while outrow < max_v_samp_factor
    bnez        t0, 0b
     addiu      t6, 4           // inrow++ (scaled by the pointer size)
10:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j           ra
     nop
END(jsimd_int_upsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
/*
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 *
 * 2:1 horizontal upsampling by pixel replication: every input sample is
 * written twice to the output row.  a0 rows are processed.
 *
 * Per row, loop 1 turns 8 input bytes into 16 output bytes per iteration;
 * loop 2 handles the remaining (output_width & 0xf) output bytes two at a
 * time.  Loop 2 always stores a pair, so an odd residual writes one byte
 * past output_width.
 *
 * Uses only caller-saved registers, so nothing is saved on the stack.
 */
    lw          t7, 0(a3)       // t7 = output_data
    andi        t8, a1, 0xf     // t8 = residual
    sll         t0, a0, 2       // t0 = max_v_samp_factor * 4
    blez        a0, 4f
     addu       t9, t7, t0      // t9 = output_data end address
0:
    lw          t5, 0(t7)       // t5 = outptr
    lw          t6, 0(a2)       // t6 = inptr
    addu        t3, t5, a1      // t3 = outptr + output_width (end address)
    subu        t3, t8          // t3 = end address - residual
    beq         t5, t3, 2f      // fewer than 16 output bytes: residual loop only
     move       t4, t8          // t4 = residual byte counter
1:
    ulw         t0, 0(t6)       // t0 = |P3|P2|P1|P0|
    ulw         t2, 4(t6)       // t2 = |P7|P6|P5|P4|
    srl         t1, t0, 16      // t1 = |X|X|P3|P2|
    ins         t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
    ins         t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
    ins         t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
    ins         t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
    usw         t0, 0(t5)
    usw         t1, 4(t5)
    srl         t0, t2, 16      // t0 = |X|X|P7|P6|
    ins         t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
    ins         t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
    ins         t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
    ins         t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
    usw         t2, 8(t5)
    usw         t0, 12(t5)
    addiu       t5, 16
    bne         t5, t3, 1b
     addiu      t6, 8
    beqz        t8, 3f          // output_width is a multiple of 16: row done
     move       t4, t8          // t4 = residual byte counter
2:
    lbu         t1, 0(t6)       // one input sample ...
    sb          t1, 0(t5)       // ... stored twice
    sb          t1, 1(t5)
    addiu       t4, -2
    addiu       t6, 1
    bgtz        t4, 2b
     addiu      t5, 2
3:
    addiu       t7, 4           // next output row pointer
    bne         t9, t7, 0b
     addiu      a2, 4           // next input row pointer
4:
    j           ra
     nop
END(jsimd_h2v1_upsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
/*
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 *
 * 2:1 horizontal and 2:1 vertical upsampling by pixel replication.  Each
 * pass of the outer loop consumes one input row and produces two output
 * rows: the input row is doubled horizontally into the first output row
 * (loops 1 and 2), which is then copied into the second output row
 * (loops 4 and 5).  a0 is decremented by 2 per pass.
 *
 * Loops 1 and 4 move 16 output bytes per iteration; loops 2 and 5 handle the
 * remaining (output_width & 0xf) bytes.  Loop 2 always stores a pair, so an
 * odd residual writes one byte past output_width in the first output row.
 *
 * Uses only caller-saved registers, so nothing is saved on the stack.
 */
    lw          t7, 0(a3)       // t7 = output_data
    blez        a0, 7f
     andi       t9, a1, 0xf     // t9 = residual
0:
    lw          t6, 0(a2)       // t6 = inptr
    lw          t5, 0(t7)       // t5 = outptr
    addu        t8, t5, a1      // t8 = outptr end address
    subu        t8, t9          // t8 = end address - residual
    beq         t5, t8, 2f      // fewer than 16 output bytes: residual loop only
     move       t4, t9          // t4 = residual byte counter
1:
    ulw         t0, 0(t6)       // t0 = |P3|P2|P1|P0|
    srl         t1, t0, 16      // t1 = |X|X|P3|P2|
    ins         t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
    ins         t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
    ins         t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
    ins         t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
    ulw         t2, 4(t6)       // t2 = |P7|P6|P5|P4|
    usw         t0, 0(t5)
    usw         t1, 4(t5)
    srl         t3, t2, 16      // t3 = |X|X|P7|P6|
    ins         t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
    ins         t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
    ins         t3, t3, 16, 16  // t3 = |P7|P6|P7|P6|
    ins         t3, t3, 8, 16   // t3 = |P7|P7|P6|P6|
    usw         t2, 8(t5)
    usw         t3, 12(t5)
    addiu       t5, 16
    bne         t5, t8, 1b
     addiu      t6, 8
    beqz        t9, 3f          // output_width is a multiple of 16
     move       t4, t9          // t4 = residual byte counter
2:
    lbu         t0, 0(t6)       // one input sample ...
    sb          t0, 0(t5)       // ... stored twice
    sb          t0, 1(t5)
    addiu       t4, -2
    addiu       t6, 1
    bgtz        t4, 2b
     addiu      t5, 2
3:
    // Duplicate the finished output row into the next output row.
    lw          t6, 0(t7)       // t6 = outptr[0]
    lw          t5, 4(t7)       // t5 = outptr[1]
    addu        t4, t6, a1      // t4 = new end address
    beq         a1, t9, 5f      // fewer than 16 bytes: byte loop only
     subu       t8, t4, t9      // t8 = end of the 16-byte chunks
4:
    ulw         t0, 0(t6)
    ulw         t1, 4(t6)
    ulw         t2, 8(t6)
    usw         t0, 0(t5)
    ulw         t0, 12(t6)
    usw         t1, 4(t5)
    usw         t2, 8(t5)
    usw         t0, 12(t5)
    addiu       t6, 16
    bne         t6, t8, 4b
     addiu      t5, 16
    beqz        t9, 6f
     nop
5:
    lbu         t0, 0(t6)
    sb          t0, 0(t5)
    addiu       t6, 1
    bne         t6, t4, 5b
     addiu      t5, 1
6:
    addiu       t7, 8           // skip the two output row pointers just filled
    addiu       a0, -2
    bgtz        a0, 0b
     addiu      a2, 4           // next input row pointer
7:
    j           ra
     nop
END(jsimd_h2v2_upsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_islow_dspr2)
/*
 * a0 = coef_block
 * a1 = compptr->dcttable
 * a2 = output
 * a3 = range_limit
 *
 * Inverse DCT on one 8x8 block of 16-bit coefficients, in two passes through
 * a 256-byte (8x8 words) workspace allocated on the stack:
 *
 *   Pass 1 (labels 1-3): one column per iteration.  Coefficients are
 *   multiplied by the matching 16-bit dcttable entries, transformed, and
 *   stored to the workspace with a rounding right shift of 11.  A column
 *   whose AC terms are all zero takes a shortcut: the dequantized DC term
 *   shifted left by 2 is stored to all eight rows.
 *
 *   Pass 2 (labels 4-6): one workspace row per iteration.  Results get a
 *   rounding right shift of 18, are masked to 10 bits and used as an index
 *   into the range_limit byte table; the 8 resulting bytes are stored
 *   through the row pointer fetched from the a2 array.  A row whose AC terms
 *   are all zero takes a shortcut based on wsptr[0] alone.
 *
 * The integer multipliers are the FIX_* constants named in the comments.
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu       sp, sp, -256    // workspace: 8 x 8 words
    move        v0, sp          // v0 = wsptr
    addiu       v1, zero, 8     // v1 = DCTSIZE = 8
1:
    lh          s4, 32(a0)      // s4 = inptr[16]
    lh          s5, 64(a0)      // s5 = inptr[32]
    lh          s6, 96(a0)      // s6 = inptr[48]
    lh          t1, 112(a0)     // t1 = inptr[56]
    lh          t7, 16(a0)      // t7 = inptr[8]
    lh          t5, 80(a0)      // t5 = inptr[40]
    lh          t3, 48(a0)      // t3 = inptr[24]
    or          s4, s4, t1      // s4 = OR of all seven AC terms of the column
    or          s4, s4, t3
    or          s4, s4, t5
    or          s4, s4, t7
    or          s4, s4, s5
    or          s4, s4, s6
    bnez        s4, 2f          // some AC term is non-zero: full column transform
     addiu      v1, v1, -1
    lh          s5, 0(a1)       // quantptr[DCTSIZE*0]
    lh          s6, 0(a0)       // inptr[DCTSIZE*0]
    mul         s5, s5, s6      // DEQUANTIZE(inptr[0], quantptr[0])
    sll         s5, s5, 2       // scale the DC term up by 2 bits
    sw          s5, 0(v0)       // same value in all eight rows of this column
    sw          s5, 32(v0)
    sw          s5, 64(v0)
    sw          s5, 96(v0)
    sw          s5, 128(v0)
    sw          s5, 160(v0)
    sw          s5, 192(v0)
    b           3f
     sw         s5, 224(v0)
2:
    /* Odd part */
    lh          t0, 112(a1)
    lh          t2, 48(a1)
    lh          t4, 80(a1)
    lh          t6, 16(a1)
    mul         t0, t0, t1      // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7])
    mul         t1, t2, t3      // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3])
    mul         t2, t4, t5      // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5])
    mul         t3, t6, t7      // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1])
    lh          t4, 32(a1)
    lh          t5, 32(a0)
    lh          t6, 96(a1)
    lh          t7, 96(a0)
    addu        s0, t0, t1       // z3 = tmp0 + tmp2
    addu        s1, t1, t2       // z2 = tmp1 + tmp2
    addu        s2, t2, t3       // z4 = tmp1 + tmp3
    addu        s3, s0, s2       // z3 + z4
    addiu       t9, zero, 9633   // FIX_1_175875602
    mul         s3, s3, t9       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
    addu        t8, t0, t3       // z1 = tmp0 + tmp3
    addiu       t9, zero, 2446   // FIX_0_298631336
    mul         t0, t0, t9       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
    addiu       t9, zero, 16819  // FIX_2_053119869
    mul         t2, t2, t9       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
    addiu       t9, zero, 25172  // FIX_3_072711026
    mul         t1, t1, t9       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
    addiu       t9, zero, 12299  // FIX_1_501321110
    mul         t3, t3, t9       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
    addiu       t9, zero, 16069  // FIX_1_961570560
    mul         s0, s0, t9       // -z3 = MULTIPLY(z3, FIX_1_961570560)
    addiu       t9, zero, 3196   // FIX_0_390180644
    mul         s2, s2, t9       // -z4 = MULTIPLY(z4, FIX_0_390180644)
    addiu       t9, zero, 7373   // FIX_0_899976223
    mul         t8, t8, t9       // -z1 = MULTIPLY(z1, FIX_0_899976223)
    addiu       t9, zero, 20995  // FIX_2_562915447
    mul         s1, s1, t9       // -z2 = MULTIPLY(z2, FIX_2_562915447)
    // The z terms above were multiplied by positive constants, so they are
    // subtracted below where the algorithm adds the negated products.
    subu        s0, s3, s0       // z3 += z5
    addu        t0, t0, s0       // tmp0 += z3
    addu        t1, t1, s0       // tmp2 += z3
    subu        s2, s3, s2       // z4 += z5
    addu        t2, t2, s2       // tmp1 += z4
    addu        t3, t3, s2       // tmp3 += z4
    subu        t0, t0, t8       // tmp0 += z1
    subu        t1, t1, s1       // tmp2 += z2
    subu        t2, t2, s1       // tmp1 += z2
    subu        t3, t3, t8       // tmp3 += z1
    /* Even part */
    mul         s0, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
    addiu       t9, zero, 6270   // FIX_0_765366865
    mul         s1, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
    lh          t4, 0(a1)
    lh          t5, 0(a0)
    lh          t6, 64(a1)
    lh          t7, 64(a0)
    mul         s2, t9, s0       // MULTIPLY(z2, FIX_0_765366865)
    mul         t5, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
    mul         t6, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
    addiu       t9, zero, 4433   // FIX_0_541196100
    addu        s3, s0, s1       // z2 + z3
    mul         s3, s3, t9       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
    addiu       t9, zero, 15137  // FIX_1_847759065
    mul         t8, s1, t9       // MULTIPLY(z3, FIX_1_847759065)
    addu        t4, t5, t6
    subu        t5, t5, t6
    sll         t4, t4, 13      // tmp0 = (z2 + z3) << CONST_BITS
    sll         t5, t5, 13      // tmp1 = (z2 - z3) << CONST_BITS
    addu        t7, s3, s2      // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
    subu        t6, s3, t8      // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
    addu        s0, t4, t7      // tmp10 = tmp0 + tmp3
    subu        s1, t4, t7      // tmp13 = tmp0 - tmp3
    addu        s2, t5, t6      // tmp11 = tmp1 + tmp2
    subu        s3, t5, t6      // tmp12 = tmp1 - tmp2
    /* Final butterflies: combine even and odd parts */
    addu        t4, s0, t3
    subu        s0, s0, t3
    addu        t3, s2, t1
    subu        s2, s2, t1
    addu        t1, s3, t2
    subu        s3, s3, t2
    addu        t2, s1, t0
    subu        s1, s1, t0
    shra_r.w    t4, t4, 11      // rounding right shift by 11
    shra_r.w    t3, t3, 11
    shra_r.w    t1, t1, 11
    shra_r.w    t2, t2, 11
    shra_r.w    s1, s1, 11
    shra_r.w    s3, s3, 11
    shra_r.w    s2, s2, 11
    shra_r.w    s0, s0, 11
    sw          t4, 0(v0)       // workspace rows 0..7 of this column
    sw          t3, 32(v0)
    sw          t1, 64(v0)
    sw          t2, 96(v0)
    sw          s1, 128(v0)
    sw          s3, 160(v0)
    sw          s2, 192(v0)
    sw          s0, 224(v0)
3:
    addiu       a1, a1, 2       // next quantization table column
    addiu       a0, a0, 2       // next coefficient column
    bgtz        v1, 1b
     addiu      v0, v0, 4       // next workspace column
    move        v0, sp          // v0 = wsptr, back at the first workspace row
    addiu       v1, zero, 8     // v1 = row counter
4:
    lw          t0, 8(v0)       // z2 = (JLONG)wsptr[2]
    lw          t1, 24(v0)      // z3 = (JLONG)wsptr[6]
    lw          t2, 0(v0)       // (JLONG)wsptr[0]
    lw          t3, 16(v0)      // (JLONG)wsptr[4]
    lw          s4, 4(v0)       // (JLONG)wsptr[1]
    lw          s5, 12(v0)      // (JLONG)wsptr[3]
    lw          s6, 20(v0)      // (JLONG)wsptr[5]
    lw          s7, 28(v0)      // (JLONG)wsptr[7]
    or          s4, s4, t0      // s4 = OR of wsptr[1..7]
    or          s4, s4, t1
    or          s4, s4, t3
    or          s4, s4, s7
    or          s4, s4, s5
    or          s4, s4, s6
    bnez        s4, 5f          // some AC term is non-zero: full row transform
     addiu      v1, v1, -1
    shra_r.w    s5, t2, 5       // rounding right shift of wsptr[0] by 5
    andi        s5, s5, 0x3ff   // keep 10 bits as the range_limit index
    lbux        s5, s5(a3)      // s5 = range_limit[index]
    lw          s1, 0(a2)       // s1 = output row pointer
    replv.qb    s5, s5          // replicate the byte into all four byte lanes
    usw         s5, 0(s1)       // 8 identical output samples
    usw         s5, 4(s1)
    b           6f
     nop
5:
    /* Even part */
    addu        t4, t0, t1       // z2 + z3
    addiu       t8, zero, 4433   // FIX_0_541196100
    mul         t5, t4, t8       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
    addiu       t8, zero, 15137  // FIX_1_847759065
    mul         t1, t1, t8       // MULTIPLY(z3, FIX_1_847759065)
    addiu       t8, zero, 6270   // FIX_0_765366865
    mul         t0, t0, t8       // MULTIPLY(z2, FIX_0_765366865)
    addu        t4, t2, t3       // (JLONG)wsptr[0] + (JLONG)wsptr[4]
    subu        t2, t2, t3       // (JLONG)wsptr[0] - (JLONG)wsptr[4]
    sll         t4, t4, 13       // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS
    sll         t2, t2, 13       // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS
    subu        t1, t5, t1       // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
    subu        t3, t2, t1       // tmp12 = tmp1 - tmp2
    addu        t2, t2, t1       // tmp11 = tmp1 + tmp2
    addu        t5, t5, t0       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
    subu        t1, t4, t5       // tmp13 = tmp0 - tmp3
    addu        t0, t4, t5       // tmp10 = tmp0 + tmp3
    /* Odd part */
    lw          t4, 28(v0)       // tmp0 = (JLONG)wsptr[7]
    lw          t6, 12(v0)       // tmp2 = (JLONG)wsptr[3]
    lw          t5, 20(v0)       // tmp1 = (JLONG)wsptr[5]
    lw          t7, 4(v0)        // tmp3 = (JLONG)wsptr[1]
    addu        s0, t4, t6       // z3 = tmp0 + tmp2
    addiu       t8, zero, 9633   // FIX_1_175875602
    addu        s1, t5, t7       // z4 = tmp1 + tmp3
    addu        s2, s0, s1       // z3 + z4
    mul         s2, s2, t8       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
    addu        s3, t4, t7       // z1 = tmp0 + tmp3
    addu        t9, t5, t6       // z2 = tmp1 + tmp2
    addiu       t8, zero, 16069  // FIX_1_961570560
    mul         s0, s0, t8       // -z3 = MULTIPLY(z3, FIX_1_961570560)
    addiu       t8, zero, 3196   // FIX_0_390180644
    mul         s1, s1, t8       // -z4 = MULTIPLY(z4, FIX_0_390180644)
    addiu       t8, zero, 2446   // FIX_0_298631336
    mul         t4, t4, t8       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
    addiu       t8, zero, 7373   // FIX_0_899976223
    mul         s3, s3, t8       // -z1 = MULTIPLY(z1, FIX_0_899976223)
    addiu       t8, zero, 16819  // FIX_2_053119869
    mul         t5, t5, t8       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
    addiu       t8, zero, 20995  // FIX_2_562915447
    mul         t9, t9, t8       // -z2 = MULTIPLY(z2, FIX_2_562915447)
    addiu       t8, zero, 25172  // FIX_3_072711026
    mul         t6, t6, t8       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
    addiu       t8, zero, 12299  // FIX_1_501321110
    mul         t7, t7, t8       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
    subu        s0, s2, s0       // z3 += z5
    subu        s1, s2, s1       // z4 += z5
    addu        t4, t4, s0
    subu        t4, t4, s3      // tmp0
    addu        t5, t5, s1
    subu        t5, t5, t9      // tmp1
    addu        t6, t6, s0
    subu        t6, t6, t9      // tmp2
    addu        t7, t7, s1
    subu        t7, t7, s3      // tmp3
    /* Final butterflies: combine even and odd parts */
    addu        s0, t0, t7      // out[0] = tmp10 + tmp3
    subu        t0, t0, t7      // out[7] = tmp10 - tmp3
    addu        t7, t2, t6      // out[1] = tmp11 + tmp2
    subu        t2, t2, t6      // out[6] = tmp11 - tmp2
    addu        t6, t3, t5      // out[2] = tmp12 + tmp1
    subu        t3, t3, t5      // out[5] = tmp12 - tmp1
    addu        t5, t1, t4      // out[3] = tmp13 + tmp0
    subu        t1, t1, t4      // out[4] = tmp13 - tmp0
    shra_r.w    s0, s0, 18      // rounding right shift by 18
    shra_r.w    t7, t7, 18
    shra_r.w    t6, t6, 18
    shra_r.w    t5, t5, 18
    shra_r.w    t1, t1, 18
    shra_r.w    t3, t3, 18
    shra_r.w    t2, t2, 18
    shra_r.w    t0, t0, 18
    andi        s0, s0, 0x3ff   // keep 10 bits as the range_limit index
    andi        t7, t7, 0x3ff
    andi        t6, t6, 0x3ff
    andi        t5, t5, 0x3ff
    andi        t1, t1, 0x3ff
    andi        t3, t3, 0x3ff
    andi        t2, t2, 0x3ff
    andi        t0, t0, 0x3ff
    lw          s1, 0(a2)       // s1 = output row pointer
    lbux        s0, s0(a3)      // look each result up in range_limit
    lbux        t7, t7(a3)
    lbux        t6, t6(a3)
    lbux        t5, t5(a3)
    lbux        t1, t1(a3)
    lbux        t3, t3(a3)
    lbux        t2, t2(a3)
    lbux        t0, t0(a3)
    sb          s0, 0(s1)
    sb          t7, 1(s1)
    sb          t6, 2(s1)
    sb          t5, 3(s1)
    sb          t1, 4(s1)
    sb          t3, 5(s1)
    sb          t2, 6(s1)
    sb          t0, 7(s1)
6:
    addiu       v0, v0, 32      // next workspace row
    bgtz        v1, 4b
     addiu      a2, a2, 4       // next output row pointer
    addiu       sp, sp, 256     // release the workspace

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop

END(jsimd_idct_islow_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
/*
 * a0 = inptr
 * a1 = quantptr
 * a2 = wsptr
 * a3 = mips_idct_ifast_coefs
 *
 * Column pass of the "ifast" inverse DCT.
 *
 * Each loop iteration loads one 32-bit word (a pair of 16-bit values) from
 * each of the eight rows of inptr and quantptr (row stride 16 bytes),
 * multiplies them together, runs the butterfly with paired-halfword (.ph)
 * arithmetic, and stores eight result words to wsptr (row stride 16 bytes).
 * All three pointers advance by 4 bytes per iteration and the loop stops
 * once a0 has advanced by 16 bytes, i.e. after four iterations.
 *
 * Registers written: v0, v1, t0-t9, AT, and s0-s7 (the latter are passed to
 * SAVE_REGS_ON_STACK / RESTORE_REGS_FROM_STACK).
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu         t9, a0, 16      // end address
    or            AT, a3, zero    // AT = mips_idct_ifast_coefs

0:
    lw            s0, 0(a1)       // quantptr[DCTSIZE*0]
    lw            t0, 0(a0)       // inptr[DCTSIZE*0]
    lw            t1, 16(a0)      // inptr[DCTSIZE*1]
    muleq_s.w.phl v0, t0, s0      // tmp0 ...
    lw            t2, 32(a0)      // inptr[DCTSIZE*2]
    lw            t3, 48(a0)      // inptr[DCTSIZE*3]
    lw            t4, 64(a0)      // inptr[DCTSIZE*4]
    lw            t5, 80(a0)      // inptr[DCTSIZE*5]
    muleq_s.w.phr t0, t0, s0      // ... tmp0 ...
    lw            t6, 96(a0)      // inptr[DCTSIZE*6]
    lw            t7, 112(a0)     // inptr[DCTSIZE*7]
    // Shortcut test: are rows 1..7 of this pair of columns all zero?
    or            s4, t1, t2
    or            s5, t3, t4
    bnez          s4, 1f
     ins          t0, v0, 16, 16  // ... tmp0
    bnez          s5, 1f
     or           s6, t5, t6
    or            s6, s6, t7
    bnez          s6, 1f
     sw           t0, 0(a2)       // wsptr[DCTSIZE*0]
    // All AC inputs are zero: replicate the DC product down the column.
    sw            t0, 16(a2)      // wsptr[DCTSIZE*1]
    sw            t0, 32(a2)      // wsptr[DCTSIZE*2]
    sw            t0, 48(a2)      // wsptr[DCTSIZE*3]
    sw            t0, 64(a2)      // wsptr[DCTSIZE*4]
    sw            t0, 80(a2)      // wsptr[DCTSIZE*5]
    sw            t0, 96(a2)      // wsptr[DCTSIZE*6]
    sw            t0, 112(a2)     // wsptr[DCTSIZE*7]
    addiu         a0, a0, 4
    b             2f
     addiu        a1, a1, 4

1:
    // General case.  Each muleq_s.w.phl/.phr pair forms the two 32-bit
    // products of one row; the following "ins" re-packs them into a single
    // register holding two halfwords.
    lw            s1, 32(a1)      // quantptr[DCTSIZE*2]
    lw            s2, 64(a1)      // quantptr[DCTSIZE*4]
    muleq_s.w.phl v0, t2, s1      // tmp1 ...
    muleq_s.w.phr t2, t2, s1      // ... tmp1 ...
    lw            s0, 16(a1)      // quantptr[DCTSIZE*1]
    lw            s1, 48(a1)      // quantptr[DCTSIZE*3]
    lw            s3, 96(a1)      // quantptr[DCTSIZE*6]
    muleq_s.w.phl v1, t4, s2      // tmp2 ...
    muleq_s.w.phr t4, t4, s2      // ... tmp2 ...
    lw            s2, 80(a1)      // quantptr[DCTSIZE*5]
    lw            t8, 4(AT)       // FIX(1.414213562)
    ins           t2, v0, 16, 16  // ... tmp1
    muleq_s.w.phl v0, t6, s3      // tmp3 ...
    muleq_s.w.phr t6, t6, s3      // ... tmp3 ...
    ins           t4, v1, 16, 16  // ... tmp2
    // Even part
    addq.ph       s4, t0, t4      // tmp10
    subq.ph       s5, t0, t4      // tmp11
    ins           t6, v0, 16, 16  // ... tmp3
    subq.ph       s6, t2, t6      // tmp12 ...
    addq.ph       s7, t2, t6      // tmp13
    mulq_s.ph     s6, s6, t8      // ... tmp12 ...
    addq.ph       t0, s4, s7      // tmp0
    subq.ph       t6, s4, s7      // tmp3
    muleq_s.w.phl v0, t1, s0      // tmp4 ...
    muleq_s.w.phr t1, t1, s0      // ... tmp4 ...
    shll_s.ph     s6, s6, 1       // x2
    lw            s3, 112(a1)     // quantptr[DCTSIZE*7]
    subq.ph       s6, s6, s7      // ... tmp12
    muleq_s.w.phl v1, t7, s3      // tmp7 ...
    muleq_s.w.phr t7, t7, s3      // ... tmp7 ...
    ins           t1, v0, 16, 16  // ... tmp4
    addq.ph       t2, s5, s6      // tmp1
    subq.ph       t4, s5, s6      // tmp2
    muleq_s.w.phl v0, t5, s2      // tmp6 ...
    muleq_s.w.phr t5, t5, s2      // ... tmp6 ...
    ins           t7, v1, 16, 16  // ... tmp7
    // Odd part
    addq.ph       s5, t1, t7      // z11
    subq.ph       s6, t1, t7      // z12
    muleq_s.w.phl v1, t3, s1      // tmp5 ...
    muleq_s.w.phr t3, t3, s1      // ... tmp5 ...
    ins           t5, v0, 16, 16  // ... tmp6
    ins           t3, v1, 16, 16  // ... tmp5
    addq.ph       s7, t5, t3      // z13
    subq.ph       v0, t5, t3      // z10
    addq.ph       t7, s5, s7      // tmp7
    subq.ph       s5, s5, s7      // tmp11 ...
    addq.ph       v1, v0, s6      // z5 ...
    mulq_s.ph     s5, s5, t8      // ... tmp11
    lw            t8, 8(AT)       // FIX(1.847759065)
    lw            s4, 0(AT)       // FIX(1.082392200)
    addq.ph       s0, t0, t7      // tmp0 + tmp7
    subq.ph       s1, t0, t7      // tmp0 - tmp7
    mulq_s.ph     v1, v1, t8      // ... z5
    shll_s.ph     s5, s5, 1       // x2
    lw            t8, 12(AT)      // FIX(-2.613125930)
    sw            s0, 0(a2)       // wsptr[DCTSIZE*0]
    shll_s.ph     v0, v0, 1       // x4
    mulq_s.ph     v0, v0, t8      // tmp12 ...
    mulq_s.ph     s4, s6, s4      // tmp10 ...
    shll_s.ph     v1, v1, 1       // x2
    addiu         a0, a0, 4       // next pair of input columns
    addiu         a1, a1, 4       // next pair of quantization entries
    sw            s1, 112(a2)     // wsptr[DCTSIZE*7]
    shll_s.ph     s6, v0, 1       // x4
    shll_s.ph     s4, s4, 1       // x2
    addq.ph       s6, s6, v1      // ... tmp12
    subq.ph       t5, s6, t7      // tmp6
    subq.ph       s4, s4, v1      // ... tmp10
    subq.ph       t3, s5, t5      // tmp5
    addq.ph       s2, t2, t5      // tmp1 + tmp6
    addq.ph       t1, s4, t3      // tmp4
    subq.ph       s3, t2, t5      // tmp1 - tmp6
    sw            s2, 16(a2)      // wsptr[DCTSIZE*1]
    sw            s3, 96(a2)      // wsptr[DCTSIZE*6]
    addq.ph       v0, t4, t3      // tmp2 + tmp5
    subq.ph       v1, t4, t3      // tmp2 - tmp5
    sw            v0, 32(a2)      // wsptr[DCTSIZE*2]
    sw            v1, 80(a2)      // wsptr[DCTSIZE*5]
    addq.ph       v0, t6, t1      // tmp3 + tmp4
    subq.ph       v1, t6, t1      // tmp3 - tmp4
    sw            v0, 64(a2)      // wsptr[DCTSIZE*4]
    sw            v1, 48(a2)      // wsptr[DCTSIZE*3]

2:
    // Both paths join here; a2 is advanced in the branch delay slot.
    bne           a0, t9, 0b
     addiu        a2, a2, 4

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j             ra
     nop

END(jsimd_idct_ifast_cols_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
/*
 * a0 = wsptr
 * a1 = output_buf
 * a2 = output_col
 * a3 = mips_idct_ifast_coefs
 *
 * Row pass of the "ifast" inverse DCT.
 *
 * Each loop iteration consumes 32 bytes of wsptr (two rows of eight
 * halfwords), re-packs them so that every register holds the same column of
 * both rows, and writes eight bytes to each of the two output rows
 * output_buf[0] + output_col and output_buf[1] + output_col.  a0 advances by
 * 32 and a1 by 8 per iteration; the loop stops when a0 reaches wsptr + 128,
 * i.e. after four iterations.
 *
 * a3 is reused as an output-row pointer inside the loop, so it is included
 * in the saved-register list and the coefficient table pointer is reloaded
 * from the stack into AT at the top of every iteration.
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

    addiu         t9, a0, 128     // end address
    lui           s8, 0x8080
    ori           s8, s8, 0x8080  // s8 = 0x80808080, added to every output
                                  // byte (presumably the +128 level shift)

0:
    lw            AT, 36(sp)      // restore $a3 (mips_idct_ifast_coefs)
    lw            t0, 0(a0)       // wsptr[DCTSIZE*0+0/1]  b a
    lw            s0, 16(a0)      // wsptr[DCTSIZE*1+0/1]  B A
    lw            t2, 4(a0)       // wsptr[DCTSIZE*0+2/3]  d c
    lw            s2, 20(a0)      // wsptr[DCTSIZE*1+2/3]  D C
    lw            t4, 8(a0)       // wsptr[DCTSIZE*0+4/5]  f e
    lw            s4, 24(a0)      // wsptr[DCTSIZE*1+4/5]  F E
    lw            t6, 12(a0)      // wsptr[DCTSIZE*0+6/7]  h g
    lw            s6, 28(a0)      // wsptr[DCTSIZE*1+6/7]  H G
    precrq.ph.w   t1, s0, t0      // B b
    ins           t0, s0, 16, 16  // A a
    // Shortcut test: is every element except column 0 zero in both rows?
    bnez          t1, 1f
     or           s0, t2, s2
    bnez          s0, 1f
     or           s0, t4, s4
    bnez          s0, 1f
     or           s0, t6, s6
    bnez          s0, 1f
     shll_s.ph    s0, t0, 2       // A a
    // DC-only rows: replicate the (shifted) column-0 value across each row.
    lw            a3, 0(a1)       // a3 = output_buf[0]
    lw            AT, 4(a1)       // AT = output_buf[1]
    precrq.ph.w   t0, s0, s0      // A A
    ins           s0, s0, 16, 16  // a a
    addu          a3, a3, a2      // + output_col
    addu          AT, AT, a2      // + output_col
    precrq.qb.ph  t0, t0, t0      // A A A A
    precrq.qb.ph  s0, s0, s0      // a a a a
    addu.qb       s0, s0, s8
    addu.qb       t0, t0, s8
    sw            s0, 0(a3)       // first row, bytes 0-3
    sw            s0, 4(a3)       // first row, bytes 4-7
    sw            t0, 0(AT)       // second row, bytes 0-3
    sw            t0, 4(AT)       // second row, bytes 4-7
    addiu         a0, a0, 32
    bne           a0, t9, 0b
     addiu        a1, a1, 8
    b             2f
     nop

1:
    // General case: finish re-packing columns 2..7 of the two rows.
    precrq.ph.w   t3, s2, t2      // D d
    ins           t2, s2, 16, 16  // C c
    precrq.ph.w   t5, s4, t4      // F f
    ins           t4, s4, 16, 16  // E e
    precrq.ph.w   t7, s6, t6      // H h
    ins           t6, s6, 16, 16  // G g
    lw            t8, 4(AT)       // FIX(1.414213562)
    // Even part
    addq.ph       s4, t0, t4      // tmp10
    subq.ph       s5, t0, t4      // tmp11
    subq.ph       s6, t2, t6      // tmp12 ...
    addq.ph       s7, t2, t6      // tmp13
    mulq_s.ph     s6, s6, t8      // ... tmp12 ...
    addq.ph       t0, s4, s7      // tmp0
    subq.ph       t6, s4, s7      // tmp3
    shll_s.ph     s6, s6, 1       // x2
    subq.ph       s6, s6, s7      // ... tmp12
    addq.ph       t2, s5, s6      // tmp1
    subq.ph       t4, s5, s6      // tmp2
    // Odd part
    addq.ph       s5, t1, t7      // z11
    subq.ph       s6, t1, t7      // z12
    addq.ph       s7, t5, t3      // z13
    subq.ph       v0, t5, t3      // z10
    addq.ph       t7, s5, s7      // tmp7
    subq.ph       s5, s5, s7      // tmp11 ...
    addq.ph       v1, v0, s6      // z5 ...
    mulq_s.ph     s5, s5, t8      // ... tmp11
    lw            t8, 8(AT)       // FIX(1.847759065)
    lw            s4, 0(AT)       // FIX(1.082392200)
    addq.ph       s0, t0, t7      // tmp0 + tmp7
    subq.ph       s7, t0, t7      // tmp0 - tmp7
    mulq_s.ph     v1, v1, t8      // ... z5
    lw            a3, 0(a1)       // a3 = output_buf[0]
    lw            t8, 12(AT)      // FIX(-2.613125930)
    shll_s.ph     s5, s5, 1       // x2
    addu          a3, a3, a2      // + output_col
    shll_s.ph     v0, v0, 1       // x4
    mulq_s.ph     v0, v0, t8      // tmp12 ...
    mulq_s.ph     s4, s6, s4      // tmp10 ...
    shll_s.ph     v1, v1, 1       // x2
    addiu         a0, a0, 32      // next two workspace rows
    addiu         a1, a1, 8       // next two output row pointers
    shll_s.ph     s6, v0, 1       // x4
    shll_s.ph     s4, s4, 1       // x2
    addq.ph       s6, s6, v1      // ... tmp12
    shll_s.ph     s0, s0, 2
    subq.ph       t5, s6, t7      // tmp6
    subq.ph       s4, s4, v1      // ... tmp10
    subq.ph       t3, s5, t5      // tmp5
    shll_s.ph     s7, s7, 2
    addq.ph       t1, s4, t3      // tmp4
    addq.ph       s1, t2, t5      // tmp1 + tmp6
    subq.ph       s6, t2, t5      // tmp1 - tmp6
    addq.ph       s2, t4, t3      // tmp2 + tmp5
    subq.ph       s5, t4, t3      // tmp2 - tmp5
    addq.ph       s4, t6, t1      // tmp3 + tmp4
    subq.ph       s3, t6, t1      // tmp3 - tmp4
    // Saturating << 2 on every result; precrq.qb.ph below then keeps only
    // the upper byte of each halfword.
    shll_s.ph     s1, s1, 2
    shll_s.ph     s2, s2, 2
    shll_s.ph     s3, s3, 2
    shll_s.ph     s4, s4, 2
    shll_s.ph     s5, s5, 2
    shll_s.ph     s6, s6, 2
    // Undo the two-row packing: lower-case = first row, upper-case = second.
    precrq.ph.w   t0, s1, s0      // B A
    ins           s0, s1, 16, 16  // b a
    precrq.ph.w   t2, s3, s2      // D C
    ins           s2, s3, 16, 16  // d c
    precrq.ph.w   t4, s5, s4      // F E
    ins           s4, s5, 16, 16  // f e
    precrq.ph.w   t6, s7, s6      // H G
    ins           s6, s7, 16, 16  // h g
    precrq.qb.ph  t0, t2, t0      // D C B A
    precrq.qb.ph  s0, s2, s0      // d c b a
    precrq.qb.ph  t4, t6, t4      // H G F E
    precrq.qb.ph  s4, s6, s4      // h g f e
    addu.qb       s0, s0, s8
    addu.qb       s4, s4, s8
    sw            s0, 0(a3)       // outptr[0/1/2/3]       d c b a
    sw            s4, 4(a3)       // outptr[4/5/6/7]       h g f e
    lw            a3, -4(a1)      // second output row (a1 was already
                                  // advanced by 8 above)
    addu.qb       t0, t0, s8
    addu          a3, a3, a2      // + output_col
    addu.qb       t4, t4, s8
    sw            t0, 0(a3)       // outptr[0/1/2/3]       D C B A
    bne           a0, t9, 0b
     sw           t4, 4(a3)       // outptr[4/5/6/7]       H G F E

2:

    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

    j             ra
     nop

END(jsimd_idct_ifast_rows_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_fdct_islow_dspr2)
/*
 * a0 = data
 *
 * "islow" forward DCT, computed in place on the 8x8 block of 16-bit values
 * at a0 (128 bytes).
 *
 * Pass 1 (label 1) processes the eight rows, 16 bytes each.  Odd outputs and
 * outputs 2/6 come from packed-halfword dot products (dpa.w.ph) against the
 * constant pairs held in t0-t9 and are rounded with ">> 11"; outputs 0/4 are
 * a plain sum/difference shifted left by 2.
 *
 * Pass 2 (label 2) processes the eight columns (element stride 16 bytes)
 * with scalar mult/madd/msub against the constants reloaded into t0-t9 and
 * a1, rounding with ">> 15"; outputs 0/4 are rounded with ">> 2".
 *
 * a0 is advanced by 16 bytes over pass 2; a1-a3, v0, v1 and t0-t9 are
 * clobbered.
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8

    // Pass 1 constants, packed as (high halfword | low halfword)
    lui         t0, 6437
    ori         t0, 2260        // t0 =   6437 |   2260
    lui         t1, 9633
    ori         t1, 11363       // t1 =   9633 |  11363
    lui         t2, 0xd39e
    ori         t2, 0xe6dc      // t2 = -11362 |  -6436
    lui         t3, 0xf72d
    ori         t3, 9633        // t3 =  -2259 |   9633
    lui         t4, 2261
    ori         t4, 9633        // t4 =   2261 |   9633
    lui         t5, 0xd39e
    ori         t5, 6437        // t5 = -11362 |   6437
    lui         t6, 9633
    ori         t6, 0xd39d      // t6 =   9633 | -11363
    lui         t7, 0xe6dc
    ori         t7, 2260        // t7 =  -6436 |   2260
    lui         t8, 4433
    ori         t8, 10703       // t8 =   4433 |  10703
    lui         t9, 0xd630
    ori         t9, 4433        // t9 = -10704 |   4433
    li          s8, 8           // row counter
    move        a1, a0          // a1 = current row pointer
1:
    lw          s0, 0(a1)       // tmp0 = 1|0
    lw          s1, 4(a1)       // tmp1 = 3|2
    lw          s2, 8(a1)       // tmp2 = 5|4
    lw          s3, 12(a1)      // tmp3 = 7|6
    packrl.ph   s1, s1, s1      // tmp1 = 2|3
    packrl.ph   s3, s3, s3      // tmp3 = 6|7
    subq.ph     s7, s1, s2      // tmp7 = 2-5|3-4 = t5|t4
    subq.ph     s5, s0, s3      // tmp5 = 1-6|0-7 = t6|t7
    mult        $0, $0          // ac0  = 0
    dpa.w.ph    $ac0, s7, t0    // ac0 += t5*  6437 + t4*  2260
    dpa.w.ph    $ac0, s5, t1    // ac0 += t6*  9633 + t7* 11363
    mult        $ac1, $0, $0    // ac1  = 0
    dpa.w.ph    $ac1, s7, t2    // ac1 += t5*-11362 + t4* -6436
    dpa.w.ph    $ac1, s5, t3    // ac1 += t6* -2259 + t7*  9633
    mult        $ac2, $0, $0    // ac2  = 0
    dpa.w.ph    $ac2, s7, t4    // ac2 += t5*  2261 + t4*  9633
    dpa.w.ph    $ac2, s5, t5    // ac2 += t6*-11362 + t7*  6437
    mult        $ac3, $0, $0    // ac3  = 0
    dpa.w.ph    $ac3, s7, t6    // ac3 += t5*  9633 + t4*-11363
    dpa.w.ph    $ac3, s5, t7    // ac3 += t6* -6436 + t7*  2260
    addq.ph     s6, s1, s2      // tmp6 = 2+5|3+4 = t2|t3
    addq.ph     s4, s0, s3      // tmp4 = 1+6|0+7 = t1|t0
    extr_r.w    s0, $ac0, 11    // tmp0 = (ac0 + 1024) >> 11
    extr_r.w    s1, $ac1, 11    // tmp1 = (ac1 + 1024) >> 11
    extr_r.w    s2, $ac2, 11    // tmp2 = (ac2 + 1024) >> 11
    extr_r.w    s3, $ac3, 11    // tmp3 = (ac3 + 1024) >> 11
    addq.ph     s5, s4, s6      // tmp5 = t1+t2|t0+t3 = t11|t10
    subq.ph     s7, s4, s6      // tmp7 = t1-t2|t0-t3 = t12|t13
    sh          s0, 2(a1)       // row element 1
    sh          s1, 6(a1)       // row element 3
    sh          s2, 10(a1)      // row element 5
    sh          s3, 14(a1)      // row element 7
    mult        $0, $0          // ac0  = 0
    dpa.w.ph    $ac0, s7, t8    // ac0 += t12*  4433 + t13* 10703
    mult        $ac1, $0, $0    // ac1  = 0
    dpa.w.ph    $ac1, s7, t9    // ac1 += t12*-10704 + t13*  4433
    sra         s4, s5, 16      // tmp4 = t11
    addiu       a1, a1, 16      // next row (stores below use negative offsets)
    addiu       s8, s8, -1
    extr_r.w    s0, $ac0, 11    // tmp0 = (ac0 + 1024) >> 11
    extr_r.w    s1, $ac1, 11    // tmp1 = (ac1 + 1024) >> 11
    addu        s2, s5, s4      // tmp2 = t10 + t11
    subu        s3, s5, s4      // tmp3 = t10 - t11
    sll         s2, s2, 2       // tmp2 = (t10 + t11) << 2
    sll         s3, s3, 2       // tmp3 = (t10 - t11) << 2
    sh          s2, -16(a1)     // row element 0
    sh          s3, -8(a1)      // row element 4
    sh          s0, -12(a1)     // row element 2
    bgtz        s8, 1b
     sh         s1, -4(a1)      // row element 6
    // Pass 2 constants (c0..c10 in the comments below)
    li          t0, 2260        // c0
    li          t1, 11363       // c1
    li          t2, 9633        // c2
    li          t3, 6436        // c3
    li          t4, 6437        // c4
    li          t5, 2261        // c5
    li          t6, 11362       // c6
    li          t7, 2259        // c7
    li          t8, 4433        // c8
    li          t9, 10703       // c9
    li          a1, 10704       // c10
    li          s8, 8           // column counter

2:
    lh          a2, 0(a0)       // 0
    lh          a3, 16(a0)      // 8
    lh          v0, 32(a0)      // 16
    lh          v1, 48(a0)      // 24
    lh          s4, 64(a0)      // 32
    lh          s5, 80(a0)      // 40
    lh          s6, 96(a0)      // 48
    lh          s7, 112(a0)     // 56
    addu        s2, v0, s5      // tmp2 = 16 + 40
    subu        s5, v0, s5      // tmp5 = 16 - 40
    addu        s3, v1, s4      // tmp3 = 24 + 32
    subu        s4, v1, s4      // tmp4 = 24 - 32
    addu        s0, a2, s7      // tmp0 =  0 + 56
    subu        s7, a2, s7      // tmp7 =  0 - 56
    addu        s1, a3, s6      // tmp1 =  8 + 48
    subu        s6, a3, s6      // tmp6 =  8 - 48
    addu        a2, s0, s3      // tmp10 = tmp0 + tmp3
    subu        v1, s0, s3      // tmp13 = tmp0 - tmp3
    addu        a3, s1, s2      // tmp11 = tmp1 + tmp2
    subu        v0, s1, s2      // tmp12 = tmp1 - tmp2
    mult        s7, t1          // ac0  = tmp7 * c1
    madd        s4, t0          // ac0 += tmp4 * c0
    madd        s5, t4          // ac0 += tmp5 * c4
    madd        s6, t2          // ac0 += tmp6 * c2
    mult        $ac1, s7, t2    // ac1  = tmp7 * c2
    msub        $ac1, s4, t3    // ac1 -= tmp4 * c3
    msub        $ac1, s5, t6    // ac1 -= tmp5 * c6
    msub        $ac1, s6, t7    // ac1 -= tmp6 * c7
    mult        $ac2, s7, t4    // ac2  = tmp7 * c4
    madd        $ac2, s4, t2    // ac2 += tmp4 * c2
    madd        $ac2, s5, t5    // ac2 += tmp5 * c5
    msub        $ac2, s6, t6    // ac2 -= tmp6 * c6
    mult        $ac3, s7, t0    // ac3  = tmp7 * c0
    msub        $ac3, s4, t1    // ac3 -= tmp4 * c1
    madd        $ac3, s5, t2    // ac3 += tmp5 * c2
    msub        $ac3, s6, t3    // ac3 -= tmp6 * c3
    extr_r.w    s0, $ac0, 15    // tmp0 = (ac0 + 16384) >> 15
    extr_r.w    s1, $ac1, 15    // tmp1 = (ac1 + 16384) >> 15
    extr_r.w    s2, $ac2, 15    // tmp2 = (ac2 + 16384) >> 15
    extr_r.w    s3, $ac3, 15    // tmp3 = (ac3 + 16384) >> 15
    addiu       s8, s8, -1
    addu        s4, a2, a3      // tmp4 = tmp10 + tmp11
    subu        s5, a2, a3      // tmp5 = tmp10 - tmp11
    sh          s0, 16(a0)      // column element in row 1
    sh          s1, 48(a0)      // column element in row 3
    sh          s2, 80(a0)      // column element in row 5
    sh          s3, 112(a0)     // column element in row 7
    mult        v0, t8          // ac0  = tmp12 * c8
    madd        v1, t9          // ac0 += tmp13 * c9
    mult        $ac1, v1, t8    // ac1  = tmp13 * c8
    msub        $ac1, v0, a1    // ac1 -= tmp12 * c10
    addiu       a0, a0, 2       // next column (stores below are relative to
                                // the advanced pointer)
    extr_r.w    s6, $ac0, 15    // tmp6 = (ac0 + 16384) >> 15
    extr_r.w    s7, $ac1, 15    // tmp7 = (ac1 + 16384) >> 15
    shra_r.w    s4, s4, 2       // tmp4 = (tmp4 + 2) >> 2
    shra_r.w    s5, s5, 2       // tmp5 = (tmp5 + 2) >> 2
    sh          s4, -2(a0)      // column element in row 0
    sh          s5, 62(a0)      // column element in row 4
    sh          s6, 30(a0)      // column element in row 2
    bgtz        s8, 2b
     sh         s7, 94(a0)      // column element in row 6

    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8

    jr          ra
     nop

END(jsimd_fdct_islow_dspr2)


/**************************************************************************/
LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
/*
 * a0 = data
 *
 * "ifast" forward DCT, computed in place on the 8x8 block of 16-bit values
 * at a0 (128 bytes).
 *
 * Pass 1 (label 0) processes the eight rows, 16 bytes each, using packed
 * halfword arithmetic and the DSP accumulators.  Pass 2 (label 1) processes
 * the eight columns (element stride 16 bytes) mostly with scalar arithmetic.
 * The multipliers are 8-bit fixed point: every product is followed by
 * ">> 8" (extr.w ..., 8 or sra ..., 8).
 *
 * a1, a2, a3 and s1 each hold one constant duplicated in both halfwords.
 * Clobbers v0, v1, a1-a3 and t0-t9; s0 and s1 are saved and restored here.
 */
    .set at

    SAVE_REGS_ON_STACK 8, s0, s1

    li          a1, 0x014e014e  // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
    li          a2, 0x008b008b  // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
    li          a3, 0x00620062  // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
    li          s1, 0x00b500b5  // FIX_0_707106781 (181 << 16)|(181 & 0xffff)

    move        v0, a0          // v0 = current row pointer
    addiu       v1, v0, 128     // end address

0:
    lw          t0, 0(v0)       // tmp0 = 1|0
    lw          t1, 4(v0)       // tmp1 = 3|2
    lw          t2, 8(v0)       // tmp2 = 5|4
    lw          t3, 12(v0)      // tmp3 = 7|6
    packrl.ph   t1, t1, t1      // tmp1 = 2|3
    packrl.ph   t3, t3, t3      // tmp3 = 6|7
    subq.ph     t7, t1, t2      // t7 = 2-5|3-4 = t5|t4
    subq.ph     t5, t0, t3      // t5 = 1-6|0-7 = t6|t7
    addq.ph     t6, t1, t2      // t6 = 2+5|3+4 = t2|t3
    addq.ph     t4, t0, t3      // t4 = 1+6|0+7 = t1|t0
    addq.ph     t8, t4, t6      // t8 = t1+t2|t0+t3 = t11|t10
    subq.ph     t9, t4, t6      // t9 = t1-t2|t0-t3 = t12|t13
    sra         t4, t8, 16      // t4 = t11
    mult        $0, $0          // ac0  = 0
    dpa.w.ph    $ac0, t9, s1    // ac0 += t12*181 + t13*181
    mult        $ac1, $0, $0    // ac1  = 0
    dpa.w.ph    $ac1, t7, a3    // ac1 += t4*98 + t5*98
    dpsx.w.ph   $ac1, t5, a3    // ac1 -= t6*98 + t7*98
    mult        $ac2, $0, $0    // ac2  = 0
    dpa.w.ph    $ac2, t7, a2    // ac2 += t4*139 + t5*139
    mult        $ac3, $0, $0    // ac3  = 0
    dpa.w.ph    $ac3, t5, a1    // ac3 += t6*334 + t7*334
    precrq.ph.w t0, t5, t7      // t0 = t6|t5
    addq.ph     t2, t8, t4      // tmp2 = t10 + t11 (low halfword)
    subq.ph     t3, t8, t4      // tmp3 = t10 - t11 (low halfword)
    extr.w      t4, $ac0, 8     // t4 = z1 = MULTIPLY(tmp12+tmp13, 181)
    mult        $0, $0          // ac0  = 0
    dpa.w.ph    $ac0, t0, s1    // ac0 += t5*181 + t6*181
    extr.w      t0, $ac1, 8     // t0 = z5
    extr.w      t1, $ac2, 8     // t1 = MULTIPLY(tmp10, 139)
    extr.w      t7, $ac3, 8     // t7 = MULTIPLY(tmp12, 334)
    extr.w      t8, $ac0, 8     // t8 = z3 = MULTIPLY(tmp11, 181)
    add         t6, t1, t0      // t6 = z2
    add         t7, t7, t0      // t7 = z4
    // Only the low halfword of the following results is stored.
    subq.ph     t0, t5, t8      // t0 = z13 = tmp7 - z3
    addq.ph     t8, t5, t8      // t8 = z11 = tmp7 + z3
    addq.ph     t1, t0, t6      // t1 = z13 + z2
    subq.ph     t6, t0, t6      // t6 = z13 - z2
    addq.ph     t0, t8, t7      // t0 = z11 + z4
    subq.ph     t7, t8, t7      // t7 = z11 - z4
    addq.ph     t5, t4, t9      // t5 = tmp13 + z1
    subq.ph     t4, t9, t4      // t4 = tmp13 - z1
    sh          t2, 0(v0)       // row element 0
    sh          t5, 4(v0)       // row element 2
    sh          t3, 8(v0)       // row element 4
    sh          t4, 12(v0)      // row element 6
    sh          t1, 10(v0)      // row element 5
    sh          t6, 6(v0)       // row element 3
    sh          t0, 2(v0)       // row element 1
    sh          t7, 14(v0)      // row element 7
    addiu       v0, 16          // next row
    bne         v1, v0, 0b
     nop
    move        v0, a0          // v0 = current column pointer
    addiu       v1, v0, 16      // end address (8 columns of 2 bytes)

1:
    lh          t0, 0(v0)       // 0
    lh          t1, 16(v0)      // 8
    lh          t2, 32(v0)      // 16
    lh          t3, 48(v0)      // 24
    lh          t4, 64(v0)      // 32
    lh          t5, 80(v0)      // 40
    lh          t6, 96(v0)      // 48
    lh          t7, 112(v0)     // 56
    add         t8, t0, t7      // t8 = tmp0
    sub         t7, t0, t7      // t7 = tmp7
    add         t0, t1, t6      // t0 = tmp1
    sub         t1, t1, t6      // t1 = tmp6
    add         t6, t2, t5      // t6 = tmp2
    sub         t5, t2, t5      // t5 = tmp5
    add         t2, t3, t4      // t2 = tmp3
    sub         t3, t3, t4      // t3 = tmp4
    add         t4, t8, t2      // t4 = tmp10 = tmp0 + tmp3
    sub         t8, t8, t2      // t8 = tmp13 = tmp0 - tmp3
    sub         s0, t0, t6      // s0 = tmp12 = tmp1 - tmp2
    ins         t8, s0, 16, 16  // t8 = tmp12|tmp13
    add         t2, t0, t6      // t2 = tmp11 = tmp1 + tmp2
    mult        $0, $0          // ac0  = 0
    dpa.w.ph    $ac0, t8, s1    // ac0 += t12*181 + t13*181
    add         s0, t4, t2      // s0 = tmp10+tmp11
    sub         t4, t4, t2      // t4 = tmp10-tmp11
    sh          s0, 0(v0)       // column element in row 0
    sh          t4, 64(v0)      // column element in row 4
    extr.w      t2, $ac0, 8     // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781)
    addq.ph     t4, t8, t2      // t4 = tmp13 + z1 (low halfword)
    subq.ph     t8, t8, t2      // t8 = tmp13 - z1 (low halfword)
    sh          t4, 32(v0)      // column element in row 2
    sh          t8, 96(v0)      // column element in row 6
    add         t3, t3, t5      // t3 = tmp10 = tmp4 + tmp5
    add         t0, t5, t1      // t0 = tmp11 = tmp5 + tmp6
    add         t1, t1, t7      // t1 = tmp12 = tmp6 + tmp7
    andi        t4, a1, 0xffff  // t4 = 334 (scalar copy of the constant)
    mul         s0, t1, t4
    sra         s0, s0, 8       // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
    ins         t1, t3, 16, 16  // t1 = tmp10|tmp12
    mult        $0, $0          // ac0  = 0
    mulsa.w.ph  $ac0, t1, a3    // ac0 += t10*98 - t12*98
    extr.w      t8, $ac0, 8     // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433)
    // z5 is folded into tmp7 here instead of into z2 and z4.
    add         t2, t7, t8      // t2 = tmp7 + z5
    sub         t7, t7, t8      // t7 = tmp7 - z5
    andi        t4, a2, 0xffff  // t4 = 139 (scalar copy of the constant)
    mul         t8, t3, t4
    sra         t8, t8, 8       // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
    andi        t4, s1, 0xffff  // t4 = 181 (scalar copy of the constant)
    mul         t6, t0, t4
    sra         t6, t6, 8       // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
    add         t0, t6, t8      // t0 = z3 + z2
    sub         t1, t6, t8      // t1 = z3 - z2
    add         t3, t6, s0      // t3 = z3 + z4
    sub         t4, t6, s0      // t4 = z3 - z4
    sub         t5, t2, t1      // t5 = dataptr[5]
    sub         t6, t7, t0      // t6 = dataptr[3]
    add         t3, t2, t3      // t3 = dataptr[1]
    add         t4, t7, t4      // t4 = dataptr[7]
    sh          t5, 80(v0)      // column element in row 5
    sh          t6, 48(v0)      // column element in row 3
    sh          t3, 16(v0)      // column element in row 1
    sh          t4, 112(v0)     // column element in row 7
    addiu       v0, 2           // next column
    bne         v0, v1, 1b
     nop

    RESTORE_REGS_FROM_STACK 8, s0, s1

    j           ra
     nop
END(jsimd_fdct_ifast_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_quantize_dspr2)
/*
 * a0 = coef_block
 * a1 = divisors
 * a2 = workspace
 *
 * Quantizes the 64 16-bit values in workspace into coef_block.  For each
 * index i the code computes
 *
 *     sign = ((w >> 15) << 1) + 1              (w = workspace[i]; -1 or +1)
 *     mag  = w * sign                          (absolute value)
 *     prod = ((mag + D[i + 64]) & 0xffff) * (D[i] & 0xffff)
 *     coef_block[i] = (prod >> (D[i + 192] + 16)) * sign
 *
 * where D is the divisors array viewed as 16-bit halfwords (byte offsets 0,
 * 128 and 384 from a1).
 *
 * The loop is software pipelined and handles two values per iteration: the
 * operands of the next pair are loaded while the current pair is finished.
 * It runs 31 times (a2 advances 4 bytes until it reaches workspace + 124);
 * the final pair is completed by the epilogue after the loop.
 *
 * Register roles (even element / odd element of the pair):
 *   t0 / t6 = magnitude           t3 / s0 = sign
 *   t1 / t7 = multiplier D[i]     t2 / t5 = addend D[i + 64]
 *   t4 / t8 = shift D[i + 192]    v1 / s2 = result
 *
 * NOTE(review): srav is an arithmetic shift, although both factors of the
 * product are masked to 16 bits; a product with bit 31 set would be shifted
 * as a negative number.  Confirm the operand ranges rule that out.
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2

    // Prologue: load and prepare the first pair.
    addiu       v0, a2, 124     // v0 = workspace_end
    lh          t0, 0(a2)       // even value
    lh          t1, 0(a1)       // even multiplier
    lh          t2, 128(a1)     // even addend
    sra         t3, t0, 15      // t3 = 0 or -1
    sll         t3, t3, 1       // t3 = 0 or -2
    addiu       t3, t3, 1       // t3 = sign (+1 or -1)
    mul         t0, t0, t3      // t0 = |even value|
    lh          t4, 384(a1)     // even shift
    lh          t5, 130(a1)     // odd addend
    lh          t6, 2(a2)       // odd value
    lh          t7, 2(a1)       // odd multiplier
    lh          t8, 386(a1)     // odd shift

1:
    // Finish the even element of the current pair.
    andi        t1, 0xffff
    add         t9, t0, t2
    andi        t9, 0xffff
    mul         v1, t9, t1
    sra         s0, t6, 15
    sll         s0, s0, 1
    addiu       s0, s0, 1       // s0 = sign of odd value
    addiu       t9, t4, 16
    srav        v1, v1, t9
    mul         v1, v1, t3      // reapply sign
    mul         t6, t6, s0      // t6 = |odd value|
    andi        t7, 0xffff
    addiu       a2, a2, 4       // advance to the next pair
    addiu       a1, a1, 4
    add         s1, t6, t5
    andi        s1, 0xffff
    sh          v1, 0(a0)

    // Finish the odd element and load/prepare the next pair.  The sign and
    // magnitude of the next even value are computed exactly once here.
    mul         s2, s1, t7
    addiu       s1, t8, 16
    srav        s2, s2, s1
    mul         s2, s2, s0      // reapply sign
    lh          t0, 0(a2)
    lh          t1, 0(a1)
    sra         t3, t0, 15
    sll         t3, t3, 1
    addiu       t3, t3, 1
    mul         t0, t0, t3
    lh          t2, 128(a1)
    lh          t4, 384(a1)
    lh          t5, 130(a1)
    lh          t8, 386(a1)
    lh          t6, 2(a2)
    lh          t7, 2(a1)
    sh          s2, 2(a0)
    bne         a2, v0, 1b
     addiu      a0, a0, 4

    // Epilogue: finish the last pair that the final iteration loaded.
    andi        t1, 0xffff
    add         t9, t0, t2
    andi        t9, 0xffff
    mul         v1, t9, t1
    sra         s0, t6, 15
    sll         s0, s0, 1
    addiu       s0, s0, 1
    addiu       t9, t4, 16
    srav        v1, v1, t9
    mul         v1, v1, t3
    mul         t6, t6, s0
    andi        t7, 0xffff
    sh          v1, 0(a0)
    add         s1, t6, t5
    andi        s1, 0xffff
    mul         s2, s1, t7
    addiu       s1, t8, 16
    srav        s2, s2, s1
    mul         s2, s2, s0
    sh          s2, 2(a0)

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2

    j           ra
     nop

END(jsimd_quantize_dspr2)


#ifndef __mips_soft_float

/*****************************************************************************/
LEAF_DSPR2(jsimd_quantize_float_dspr2)
/*
 * a0 = coef_block
 * a1 = divisors
 * a2 = workspace
 *
 * Floating-point quantization of 64 values.  For every index i:
 *
 *     coef_block[i] = trunc(workspace[i] * divisors[i] + 16384.5) - 16384
 *
 * with workspace and divisors read as 32-bit floats and coef_block written
 * as 16-bit halfwords.  Adding 16384.5 before truncation and subtracting
 * 16384 afterwards rounds to nearest for inputs of either sign, as long as
 * the biased value stays positive.
 *
 * Eight values are processed per loop iteration in two interleaved groups
 * of four; the loop runs eight times (t0 counts 63, 55, ..., 7).
 *
 * Clobbers t0-t4 and f0-f16 (even registers); no registers are saved.
 */
    .set at

    li          t1, 0x46800100  // IEEE-754 single-precision bits of 16384.5
    mtc1        t1, f0          // f0 = 16384.5f (rounding bias)
    li          t0, 63          // loop counter
0:
    lwc1        f2, 0(a2)       // workspace[0..3]
    lwc1        f10, 0(a1)      // divisors[0..3]
    lwc1        f4, 4(a2)
    lwc1        f12, 4(a1)
    lwc1        f6, 8(a2)
    lwc1        f14, 8(a1)
    lwc1        f8, 12(a2)
    lwc1        f16, 12(a1)
    madd.s      f2, f0, f2, f10 // f2 = workspace[0] * divisors[0] + 16384.5
    madd.s      f4, f0, f4, f12
    madd.s      f6, f0, f6, f14
    madd.s      f8, f0, f8, f16
    lwc1        f10, 16(a1)     // divisors[4..5] for the second group
    lwc1        f12, 20(a1)
    trunc.w.s   f2, f2          // convert to integer, rounding toward zero
    trunc.w.s   f4, f4
    trunc.w.s   f6, f6
    trunc.w.s   f8, f8
    lwc1        f14, 24(a1)     // divisors[6..7] for the second group
    lwc1        f16, 28(a1)
    mfc1        t1, f2          // first group results to integer registers
    mfc1        t2, f4
    mfc1        t3, f6
    mfc1        t4, f8
    lwc1        f2, 16(a2)      // workspace[4..7]
    lwc1        f4, 20(a2)
    lwc1        f6, 24(a2)
    lwc1        f8, 28(a2)
    madd.s      f2, f0, f2, f10 // second group: value * divisor + 16384.5
    madd.s      f4, f0, f4, f12
    madd.s      f6, f0, f6, f14
    madd.s      f8, f0, f8, f16
    addiu       t1, t1, -16384  // remove the integer part of the bias
    addiu       t2, t2, -16384
    addiu       t3, t3, -16384
    addiu       t4, t4, -16384
    trunc.w.s   f2, f2
    trunc.w.s   f4, f4
    trunc.w.s   f6, f6
    trunc.w.s   f8, f8
    sh          t1, 0(a0)       // coef_block[0..3]
    sh          t2, 2(a0)
    sh          t3, 4(a0)
    sh          t4, 6(a0)
    mfc1        t1, f2          // second group results to integer registers
    mfc1        t2, f4
    mfc1        t3, f6
    mfc1        t4, f8
    addiu       t0, t0, -8      // eight values done
    addiu       a2, a2, 32      // next eight floats of workspace
    addiu       a1, a1, 32      // next eight floats of divisors
    addiu       t1, t1, -16384
    addiu       t2, t2, -16384
    addiu       t3, t3, -16384
    addiu       t4, t4, -16384
    sh          t1, 8(a0)       // coef_block[4..7]
    sh          t2, 10(a0)
    sh          t3, 12(a0)
    sh          t4, 14(a0)
    bgez        t0, 0b
     addiu      a0, a0, 16      // next eight output halfwords (delay slot)

    j           ra
     nop

END(jsimd_quantize_float_dspr2)

#endif


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_2x2_dspr2)
/*
 * a0 = compptr->dct_table
 * a1 = coef_block
 * a2 = output_buf
 * a3 = output_col
 *
 * Reduced-size inverse DCT producing a 2x2 block of output samples.
 *
 * Pass 1 (columns).  Only columns 0, 1, 3, 5 and 7 are read (halfword byte
 * offsets 0, 2, 6, 10, 14), and within each column only rows 0, 1, 3, 5, 7.
 * With rowN = inptr[DCTSIZE*N] * quantptr[DCTSIZE*N]:
 *     tmp10 = row0 << 15
 *     tmp0  = row1 * 29692 + row3 * -10426 + row5 * 6967 + row7 * -5906
 *     ws[0][k] = round_shift(tmp10 + tmp0, 13)
 *     ws[1][k] = round_shift(tmp10 - tmp0, 13)
 * The odd-row products are packed two per register with ins and multiplied
 * with dpa.w.ph, so only their low 16 bits take part in the sum.
 *
 * Pass 2 (rows).  Each of the two workspace rows holds five words
 * (columns 0, 1, 3, 5, 7).  The same formula is applied with
 * col0 << 15 as tmp10 and the other four words as the odd inputs; the
 * two results are round-shifted by 20, saturated to the signed 8-bit
 * range (shll_s.w 24 / sra 24), offset by 128 and stored as bytes at
 * output_buf[row] + output_col + {0, 1}.
 *
 * The workspace is a 40-byte area allocated on the stack below the register
 * save frame: row 0 at 0..16(v0), row 1 at 20..36(v0).
 *
 * NOTE(review): several `mul` instructions execute between `mult zero, zero`
 * and the following dpa.w.ph/mflo on $ac0.  The code relies on `mul` leaving
 * $ac0 untouched -- confirm this holds for every targeted core.
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    addiu       sp, sp, -40     // 2 rows x 5 words of workspace
    move        v0, sp          // v0 = workspace base
    // Scalar copies of the four multipliers, used by mult/madd in pass 2.
    addiu       s2, zero, 29692
    addiu       s3, zero, -10426
    addiu       s4, zero, 6967
    addiu       s5, zero, -5906

    // ---- Pass 1, column 0 ----
    lh          t0, 0(a1)       // t0 = inptr[DCTSIZE*0]
    lh          t5, 0(a0)       // t5 = quantptr[DCTSIZE*0]
    lh          t1, 48(a1)      // t1 = inptr[DCTSIZE*3]
    lh          t6, 48(a0)      // t6 = quantptr[DCTSIZE*3]
    mul         t4, t5, t0      // t4 = row0
    lh          t0, 16(a1)      // t0 = inptr[DCTSIZE*1]
    lh          t5, 16(a0)      // t5 = quantptr[DCTSIZE*1]
    mul         t6, t6, t1      // t6 = row3
    mul         t5, t5, t0      // t5 = row1
    lh          t2, 80(a1)      // t2 = inptr[DCTSIZE*5]
    lh          t7, 80(a0)      // t7 = quantptr[DCTSIZE*5]
    lh          t3, 112(a1)     // t3 = inptr[DCTSIZE*7]
    lh          t8, 112(a0)     // t8 = quantptr[DCTSIZE*7]
    mul         t7, t7, t2      // t7 = row5
    mult        zero, zero      // clear $ac0
    mul         t8, t8, t3      // t8 = row7
    // Packed multipliers for dpa.w.ph, used by every column of pass 1.
    li          s0, 0x73FCD746  // s0 = (29692 << 16) | (-10426 & 0xffff)
    li          s1, 0x1B37E8EE  // s1 = (6967 << 16) | (-5906 & 0xffff)
    ins         t6, t5, 16, 16  // t6 = (row1 << 16) | (row3 & 0xffff)
    sll         t4, t4, 15      // tmp10 = row0 << 15
    dpa.w.ph    $ac0, t6, s0    // ac0 += row1 * 29692 + row3 * -10426
    // (column 1, row 0 loads are hoisted here)
    lh          t1, 2(a1)
    lh          t6, 2(a0)
    ins         t8, t7, 16, 16  // t8 = (row5 << 16) | (row7 & 0xffff)
    dpa.w.ph    $ac0, t8, s1    // ac0 += row5 * 6967 + row7 * -5906
    mflo        t0, $ac0        // t0 = tmp0 (column 0)

    // ---- Pass 1, column 1 (column 0 results are finished in between) ----
    mul         t5, t6, t1      // t5 = row0 (column 1)
    lh          t1, 18(a1)
    lh          t6, 18(a0)
    lh          t2, 50(a1)
    lh          t7, 50(a0)
    mul         t6, t6, t1      // t6 = row1
    subu        t8, t4, t0      // column 0: tmp10 - tmp0
    mul         t7, t7, t2      // t7 = row3
    addu        t0, t4, t0      // column 0: tmp10 + tmp0
    shra_r.w    t0, t0, 13
    lh          t1, 82(a1)
    lh          t2, 82(a0)
    lh          t3, 114(a1)
    lh          t4, 114(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2      // t1 = row5
    mul         t3, t3, t4      // t3 = row7
    sw          t0, 0(v0)       // ws[0][0]
    sw          t8, 20(v0)      // ws[1][0]
    sll         t4, t5, 15      // tmp10 (column 1)
    ins         t7, t6, 16, 16  // t7 = (row1 << 16) | (row3 & 0xffff)
    mult        zero, zero      // clear $ac0
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16  // t3 = (row5 << 16) | (row7 & 0xffff)
    // (column 3, row 0 loads are hoisted here)
    lh          t1, 6(a1)
    lh          t6, 6(a0)
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0        // t0 = tmp0 (column 1)

    // ---- Pass 1, column 3 (column 1 results are finished in between) ----
    mul         t5, t6, t1      // t5 = row0 (column 3)
    lh          t1, 22(a1)
    lh          t6, 22(a0)
    lh          t2, 54(a1)
    lh          t7, 54(a0)
    mul         t6, t6, t1      // t6 = row1
    subu        t8, t4, t0      // column 1: tmp10 - tmp0
    mul         t7, t7, t2      // t7 = row3
    addu        t0, t4, t0      // column 1: tmp10 + tmp0
    shra_r.w    t0, t0, 13
    lh          t1, 86(a1)
    lh          t2, 86(a0)
    lh          t3, 118(a1)
    lh          t4, 118(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2      // t1 = row5
    mul         t3, t3, t4      // t3 = row7
    sw          t0, 4(v0)       // ws[0][1]
    sw          t8, 24(v0)      // ws[1][1]
    sll         t4, t5, 15      // tmp10 (column 3)
    ins         t7, t6, 16, 16
    mult        zero, zero      // clear $ac0
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    // (column 5, row 0 loads are hoisted here)
    lh          t1, 10(a1)
    lh          t6, 10(a0)
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0        // t0 = tmp0 (column 3)

    // ---- Pass 1, column 5 (column 3 results are finished in between) ----
    mul         t5, t6, t1      // t5 = row0 (column 5)
    lh          t1, 26(a1)
    lh          t6, 26(a0)
    lh          t2, 58(a1)
    lh          t7, 58(a0)
    mul         t6, t6, t1      // t6 = row1
    subu        t8, t4, t0      // column 3: tmp10 - tmp0
    mul         t7, t7, t2      // t7 = row3
    addu        t0, t4, t0      // column 3: tmp10 + tmp0
    shra_r.w    t0, t0, 13
    lh          t1, 90(a1)
    lh          t2, 90(a0)
    lh          t3, 122(a1)
    lh          t4, 122(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2      // t1 = row5
    mul         t3, t3, t4      // t3 = row7
    sw          t0, 8(v0)       // ws[0][2] (column 3)
    sw          t8, 28(v0)      // ws[1][2]
    sll         t4, t5, 15      // tmp10 (column 5)
    ins         t7, t6, 16, 16
    mult        zero, zero      // clear $ac0
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    // (column 7, row 0 loads are hoisted here)
    lh          t1, 14(a1)
    lh          t6, 14(a0)
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0        // t0 = tmp0 (column 5)

    // ---- Pass 1, column 7 (column 5 results are finished in between) ----
    mul         t5, t6, t1      // t5 = row0 (column 7)
    lh          t1, 30(a1)
    lh          t6, 30(a0)
    lh          t2, 62(a1)
    lh          t7, 62(a0)
    mul         t6, t6, t1      // t6 = row1
    subu        t8, t4, t0      // column 5: tmp10 - tmp0
    mul         t7, t7, t2      // t7 = row3
    addu        t0, t4, t0      // column 5: tmp10 + tmp0
    shra_r.w    t0, t0, 13
    lh          t1, 94(a1)
    lh          t2, 94(a0)
    lh          t3, 126(a1)
    lh          t4, 126(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2      // t1 = row5
    mul         t3, t3, t4      // t3 = row7
    sw          t0, 12(v0)      // ws[0][3] (column 5)
    sw          t8, 32(v0)      // ws[1][3]
    sll         t4, t5, 15      // tmp10 (column 7)
    ins         t7, t6, 16, 16
    mult        zero, zero      // clear $ac0
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0        // t0 = tmp0 (column 7)

    // ---- Pass 2, row 0 (column 7 results are finished in between) ----
    lw          t9, 0(a2)       // t9 = output_buf[0]
    lw          t3, 0(v0)       // ws[0][0]
    lw          t7, 4(v0)       // ws[0][1]
    lw          t1, 8(v0)       // ws[0][2]
    addu        t9, t9, a3      // outptr = output_buf[0] + output_col
    sll         t3, t3, 15      // tmp10 (row 0)
    subu        t8, t4, t0      // column 7: tmp10 - tmp0
    addu        t0, t4, t0      // column 7: tmp10 + tmp0
    shra_r.w    t0, t0, 13
    shra_r.w    t8, t8, 13
    sw          t0, 16(v0)      // ws[0][4] (column 7)
    sw          t8, 36(v0)      // ws[1][4]
    lw          t5, 12(v0)      // ws[0][3]
    lw          t6, 16(v0)      // ws[0][4]
    // $ac0 = tmp0 for row 0
    mult        t7, s2
    madd        t1, s3
    madd        t5, s4
    madd        t6, s5
    lw          t5, 24(v0)      // ws[1][1]
    lw          t7, 28(v0)      // ws[1][2]
    mflo        t0, $ac0        // t0 = tmp0 (row 0)
    lw          t8, 32(v0)      // ws[1][3]
    lw          t2, 36(v0)      // ws[1][4]
    // $ac1 = tmp0 for row 1
    mult        $ac1, t5, s2
    madd        $ac1, t7, s3
    madd        $ac1, t8, s4
    madd        $ac1, t2, s5
    addu        t1, t3, t0      // row 0: tmp10 + tmp0
    subu        t6, t3, t0      // row 0: tmp10 - tmp0
    shra_r.w    t1, t1, 20
    shra_r.w    t6, t6, 20
    mflo        t4, $ac1        // t4 = tmp0 (row 1)
    // Saturate to the signed 8-bit range, then re-centre by adding 128.
    shll_s.w    t1, t1, 24
    shll_s.w    t6, t6, 24
    sra         t1, t1, 24
    sra         t6, t6, 24
    addiu       t1, t1, 128
    addiu       t6, t6, 128
    lw          t0, 20(v0)      // ws[1][0]
    sb          t1, 0(t9)
    sb          t6, 1(t9)

    // ---- Pass 2, row 1 ----
    sll         t0, t0, 15      // tmp10 (row 1)
    lw          t9, 4(a2)       // t9 = output_buf[1]
    addu        t1, t0, t4      // tmp10 + tmp0
    subu        t6, t0, t4      // tmp10 - tmp0
    addu        t9, t9, a3      // outptr = output_buf[1] + output_col
    shra_r.w    t1, t1, 20
    shra_r.w    t6, t6, 20
    shll_s.w    t1, t1, 24
    shll_s.w    t6, t6, 24
    sra         t1, t1, 24
    sra         t6, t6, 24
    addiu       t1, t1, 128
    addiu       t6, t6, 128
    sb          t1, 0(t9)
    sb          t6, 1(t9)
    addiu       sp, sp, 40      // release the workspace

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j           ra
     nop

END(jsimd_idct_2x2_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_4x4_dspr2)
/*
 * a0     = compptr->dct_table
 * a1     = coef_block
 * a2     = output_buf
 * a3     = output_col
 * 16(sp) = workspace[DCTSIZE*4];  // buffers data between passes
 *
 * Reduced-size inverse DCT producing a 4x4 block of output samples.
 *
 * Pass 1 (columns) runs over columns 0-3 (loop 0) and columns 5-7 (loop 1);
 * column 4 is never read.  With rowN = inptr[DCTSIZE*N] * quantptr[DCTSIZE*N]:
 *     tmp0  = row0 << 14
 *     tmp2  = row2 * 15137 - row6 * 6270
 *     tmp10 = tmp0 + tmp2,  tmp12 = tmp0 - tmp2
 *     temp1 = row1 * 8697  + row3 * -17799 + row5 * 11893 + row7 * -1730
 *     temp2 = row1 * 20995 + row3 * 7373   + row5 * -4926 + row7 * -4176
 * The odd-row products are packed two per register with ins and multiplied
 * with dpa.w.ph, so only their low 16 bits take part in temp1/temp2.
 * Results are round-shifted by 12 and written to the workspace, which is laid
 * out as 4 rows of 8 words (32 bytes per row):
 *     row 0 = tmp10 + temp2     row 1 = tmp12 + temp1
 *     row 2 = tmp12 - temp1     row 3 = tmp10 - temp2
 *
 * Pass 2 (rows) applies the same formula to workspace words 0-3 and 5-7 of
 * each row, round-shifts by 19, saturates to the signed 8-bit range
 * (shll_s.w 24 / sra 24), adds 128 and stores four bytes at
 * output_buf[row] + output_col.  It is fully unrolled, one copy per row.
 *
 * NOTE(review): pass 2 reads the odd workspace words with lh, i.e. the
 * halfword at the word's lowest address.  That is the low half of the value
 * only on a little-endian target -- confirm this file is never built
 * big-endian.
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          v1, 48(sp)      // v1 = workspace (caller's 16(sp) + 32-byte save frame)
    move        t0, a1          // t0 = inptr
    move        t1, v1          // t1 = wsptr
    li          t9, 4           // loop 0 handles columns 0..3
    // Packed multipliers for dpa.w.ph: (high half << 16) | (low half & 0xffff)
    li          s0, 0x2e75f93e  // temp1: row5 * 11893, row7 * -1730
    li          s1, 0x21f9ba79  // temp1: row1 * 8697,  row3 * -17799
    li          s2, 0xecc2efb0  // temp2: row5 * -4926, row7 * -4176
    li          s3, 0x52031ccd  // temp2: row1 * 20995, row3 * 7373

0:
    lh          s6, 32(t0)      // inptr[DCTSIZE*2]
    lh          t6, 32(a0)      // quantptr[DCTSIZE*2]
    lh          s7, 96(t0)      // inptr[DCTSIZE*6]
    lh          t7, 96(a0)      // quantptr[DCTSIZE*6]
    mul         t6, s6, t6      // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
    lh          s4, 0(t0)       // inptr[DCTSIZE*0]
    mul         t7, s7, t7      // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
    lh          s5, 0(a0)       // quantptr[0]
    li          s6, 15137
    li          s7, 6270
    mul         t2, s4, s5      // tmp0 = (inptr[0] * quantptr[0])
    mul         t6, s6, t6      // z2 * 15137
    lh          t5, 112(t0)     // inptr[DCTSIZE*7]
    mul         t7, s7, t7      // z3 * 6270
    lh          s4, 112(a0)     // quantptr[DCTSIZE*7]
    lh          v0, 80(t0)      // inptr[DCTSIZE*5]
    lh          s5, 80(a0)      // quantptr[DCTSIZE*5]
    lh          s6, 48(a0)      // quantptr[DCTSIZE*3]
    sll         t2, t2, 14      // tmp0 <<= (CONST_BITS+1)
    lh          s7, 16(a0)      // quantptr[DCTSIZE*1]
    lh          t8, 16(t0)      // inptr[DCTSIZE*1]
    subu        t6, t6, t7      // tmp2 = z2 * 15137 - z3 * 6270
    lh          t7, 48(t0)      // inptr[DCTSIZE*3]
    mul         t5, s4, t5      // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
    mul         v0, s5, v0      // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
    mul         t7, s6, t7      // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
    mul         t8, s7, t8      // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
    addu        t3, t2, t6      // tmp10 = tmp0 + tmp2
    subu        t4, t2, t6      // tmp12 = tmp0 - tmp2
    mult        $ac0, zero, zero
    mult        $ac1, zero, zero
    ins         t5, v0, 16, 16  // t5 = (z2 << 16) | (z1 & 0xffff)
    ins         t7, t8, 16, 16  // t7 = (z4 << 16) | (z3 & 0xffff)
    addiu       t9, t9, -1
    dpa.w.ph    $ac0, t5, s0
    dpa.w.ph    $ac0, t7, s1
    dpa.w.ph    $ac1, t5, s2
    dpa.w.ph    $ac1, t7, s3
    mflo        s4, $ac0        // temp1
    mflo        s5, $ac1        // temp2
    addiu       a0, a0, 2
    addiu       t1, t1, 4       // wsptr is advanced before the stores below
    addiu       t0, t0, 2
    addu        t6, t4, s4
    subu        t5, t4, s4
    addu        s6, t3, s5
    subu        s7, t3, s5
    shra_r.w    t6, t6, 12      // DESCALE(tmp12 + temp1, 12)
    shra_r.w    t5, t5, 12      // DESCALE(tmp12 - temp1, 12)
    shra_r.w    s6, s6, 12      // DESCALE(tmp10 + temp2, 12)
    shra_r.w    s7, s7, 12      // DESCALE(tmp10 - temp2, 12)
    sw          t6, 28(t1)      // workspace row 1
    sw          t5, 60(t1)      // workspace row 2
    sw          s6, -4(t1)      // workspace row 0
    bgtz        t9, 0b
     sw         s7, 92(t1)      // (delay slot) workspace row 3
    // Second loop: three passes for columns 5..7.  The pointers now address
    // column 4, so every load uses an extra +2 and the stores (made after
    // wsptr has been advanced) land on the next workspace word.
    li          t9, 3
1:
    lh          s6, 34(t0)      // inptr[DCTSIZE*2]
    lh          t6, 34(a0)      // quantptr[DCTSIZE*2]
    lh          s7, 98(t0)      // inptr[DCTSIZE*6]
    lh          t7, 98(a0)      // quantptr[DCTSIZE*6]
    mul         t6, s6, t6      // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
    lh          s4, 2(t0)       // inptr[DCTSIZE*0]
    mul         t7, s7, t7      // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
    lh          s5, 2(a0)       // quantptr[DCTSIZE*0]
    li          s6, 15137
    li          s7, 6270
    mul         t2, s4, s5      // tmp0 = (inptr[0] * quantptr[0])
    mul         v0, s6, t6      // z2 * 15137
    lh          t5, 114(t0)     // inptr[DCTSIZE*7]
    mul         t7, s7, t7      // z3 * 6270
    lh          s4, 114(a0)     // quantptr[DCTSIZE*7]
    lh          s5, 82(a0)      // quantptr[DCTSIZE*5]
    lh          t6, 82(t0)      // inptr[DCTSIZE*5]
    sll         t2, t2, 14      // tmp0 <<= (CONST_BITS+1)
    lh          s6, 50(a0)      // quantptr[DCTSIZE*3]
    lh          t8, 18(t0)      // inptr[DCTSIZE*1]
    subu        v0, v0, t7      // tmp2 = z2 * 15137 - z3 * 6270
    lh          t7, 50(t0)      // inptr[DCTSIZE*3]
    lh          s7, 18(a0)      // quantptr[DCTSIZE*1]
    mul         t5, s4, t5      // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
    mul         t6, s5, t6      // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
    mul         t7, s6, t7      // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
    mul         t8, s7, t8      // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
    addu        t3, t2, v0      // tmp10 = tmp0 + tmp2
    subu        t4, t2, v0      // tmp12 = tmp0 - tmp2
    mult        $ac0, zero, zero
    mult        $ac1, zero, zero
    ins         t5, t6, 16, 16  // t5 = (z2 << 16) | (z1 & 0xffff)
    ins         t7, t8, 16, 16  // t7 = (z4 << 16) | (z3 & 0xffff)
    dpa.w.ph    $ac0, t5, s0
    dpa.w.ph    $ac0, t7, s1
    dpa.w.ph    $ac1, t5, s2
    dpa.w.ph    $ac1, t7, s3
    mflo        t5, $ac0        // temp1
    mflo        t6, $ac1        // temp2
    addiu       t9, t9, -1
    addiu       t0, t0, 2
    addiu       a0, a0, 2
    addiu       t1, t1, 4
    addu        s5, t4, t5
    subu        s4, t4, t5
    addu        s6, t3, t6
    subu        s7, t3, t6
    shra_r.w    s5, s5, 12      // DESCALE(tmp12 + temp1, 12)
    shra_r.w    s4, s4, 12      // DESCALE(tmp12 - temp1, 12)
    shra_r.w    s6, s6, 12      // DESCALE(tmp10 + temp2, 12)
    shra_r.w    s7, s7, 12      // DESCALE(tmp10 - temp2, 12)
    sw          s5, 32(t1)      // workspace row 1
    sw          s4, 64(t1)      // workspace row 2
    sw          s6, 0(t1)       // workspace row 0
    bgtz        t9, 1b
     sw         s7, 96(t1)      // (delay slot) workspace row 3

    // Pass 2, workspace row 0 -> output_buf[0]
    move        t1, v1          // t1 = wsptr = start of workspace
    li          s4, 15137
    lw          s6, 8(t1)       // wsptr[2]
    li          s5, 6270
    lw          s7, 24(t1)      // wsptr[6]
    mul         s4, s4, s6      // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
    lw          t2, 0(t1)       // wsptr[0]
    mul         s5, s5, s7      // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
    lh          t5, 28(t1)      // wsptr[7]
    lh          t6, 20(t1)      // wsptr[5]
    lh          t7, 12(t1)      // wsptr[3]
    lh          t8, 4(t1)       // wsptr[1]
    ins         t5, t6, 16, 16
    ins         t7, t8, 16, 16
    mult        $ac0, zero, zero
    dpa.w.ph    $ac0, t5, s0
    dpa.w.ph    $ac0, t7, s1
    mult        $ac1, zero, zero
    dpa.w.ph    $ac1, t5, s2
    dpa.w.ph    $ac1, t7, s3
    sll         t2, t2, 14      // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
    mflo        s6, $ac0        // temp1
    // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
    subu        s4, s4, s5
    addu        t3, t2, s4      // tmp10 = tmp0 + tmp2
    mflo        s7, $ac1        // temp2
    subu        t4, t2, s4      // tmp12 = tmp0 - tmp2
    addu        t7, t4, s6
    subu        t8, t4, s6
    addu        t5, t3, s7
    subu        t6, t3, s7
    shra_r.w    t5, t5, 19      // DESCALE(tmp10 + temp2, 19)
    shra_r.w    t6, t6, 19      // DESCALE(tmp10 - temp2, 19)
    shra_r.w    t7, t7, 19      // DESCALE(tmp12 + temp1, 19)
    shra_r.w    t8, t8, 19      // DESCALE(tmp12 - temp1, 19)
    lw          v0, 0(a2)       // output_buf[0]
    // Saturate to the signed 8-bit range, then re-centre by adding 128.
    shll_s.w    t5, t5, 24
    shll_s.w    t6, t6, 24
    shll_s.w    t7, t7, 24
    shll_s.w    t8, t8, 24
    sra         t5, t5, 24
    sra         t6, t6, 24
    sra         t7, t7, 24
    sra         t8, t8, 24
    addu        v0, v0, a3      // outptr = output_buf[0] + output_col
    addiu       t5, t5, 128
    addiu       t6, t6, 128
    addiu       t7, t7, 128
    addiu       t8, t8, 128
    sb          t5, 0(v0)
    sb          t7, 1(v0)
    sb          t8, 2(v0)
    sb          t6, 3(v0)

    // Pass 2, workspace row 1 -> output_buf[1]
    li          s4, 15137
    lw          s6, 40(t1)      // wsptr[2]
    li          s5, 6270
    lw          s7, 56(t1)      // wsptr[6]
    mul         s4, s4, s6      // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
    lw          t2, 32(t1)      // wsptr[0]
    mul         s5, s5, s7      // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
    lh          t5, 60(t1)      // wsptr[7]
    lh          t6, 52(t1)      // wsptr[5]
    lh          t7, 44(t1)      // wsptr[3]
    lh          t8, 36(t1)      // wsptr[1]
    ins         t5, t6, 16, 16
    ins         t7, t8, 16, 16
    mult        $ac0, zero, zero
    dpa.w.ph    $ac0, t5, s0
    dpa.w.ph    $ac0, t7, s1
    mult        $ac1, zero, zero
    dpa.w.ph    $ac1, t5, s2
    dpa.w.ph    $ac1, t7, s3
    sll         t2, t2, 14      // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
    mflo        s6, $ac0        // temp1
    // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
    subu        s4, s4, s5
    addu        t3, t2, s4      // tmp10 = tmp0 + tmp2
    mflo        s7, $ac1        // temp2
    subu        t4, t2, s4      // tmp12 = tmp0 - tmp2
    addu        t7, t4, s6
    subu        t8, t4, s6
    addu        t5, t3, s7
    subu        t6, t3, s7
    shra_r.w    t5, t5, 19      // DESCALE(tmp10 + temp2, 19)
    shra_r.w    t6, t6, 19      // DESCALE(tmp10 - temp2, 19)
    shra_r.w    t7, t7, 19      // DESCALE(tmp12 + temp1, 19)
    shra_r.w    t8, t8, 19      // DESCALE(tmp12 - temp1, 19)
    lw          v0, 4(a2)       // output_buf[1]
    shll_s.w    t5, t5, 24
    shll_s.w    t6, t6, 24
    shll_s.w    t7, t7, 24
    shll_s.w    t8, t8, 24
    sra         t5, t5, 24
    sra         t6, t6, 24
    sra         t7, t7, 24
    sra         t8, t8, 24
    addu        v0, v0, a3      // outptr = output_buf[1] + output_col
    addiu       t5, t5, 128
    addiu       t6, t6, 128
    addiu       t7, t7, 128
    addiu       t8, t8, 128
    sb          t5, 0(v0)
    sb          t7, 1(v0)
    sb          t8, 2(v0)
    sb          t6, 3(v0)

    // Pass 2, workspace row 2 -> output_buf[2]
    li          s4, 15137
    lw          s6, 72(t1)      // wsptr[2]
    li          s5, 6270
    lw          s7, 88(t1)      // wsptr[6]
    mul         s4, s4, s6      // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
    lw          t2, 64(t1)      // wsptr[0]
    mul         s5, s5, s7      // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
    lh          t5, 92(t1)      // wsptr[7]
    lh          t6, 84(t1)      // wsptr[5]
    lh          t7, 76(t1)      // wsptr[3]
    lh          t8, 68(t1)      // wsptr[1]
    ins         t5, t6, 16, 16
    ins         t7, t8, 16, 16
    mult        $ac0, zero, zero
    dpa.w.ph    $ac0, t5, s0
    dpa.w.ph    $ac0, t7, s1
    mult        $ac1, zero, zero
    dpa.w.ph    $ac1, t5, s2
    dpa.w.ph    $ac1, t7, s3
    sll         t2, t2, 14      // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
    mflo        s6, $ac0        // temp1
    // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
    subu        s4, s4, s5
    addu        t3, t2, s4      // tmp10 = tmp0 + tmp2
    mflo        s7, $ac1        // temp2
    subu        t4, t2, s4      // tmp12 = tmp0 - tmp2
    addu        t7, t4, s6
    subu        t8, t4, s6
    addu        t5, t3, s7
    subu        t6, t3, s7
    shra_r.w    t5, t5, 19      // DESCALE(tmp10 + temp2, 19)
    shra_r.w    t6, t6, 19      // DESCALE(tmp10 - temp2, 19)
    shra_r.w    t7, t7, 19      // DESCALE(tmp12 + temp1, 19)
    shra_r.w    t8, t8, 19      // DESCALE(tmp12 - temp1, 19)
    lw          v0, 8(a2)       // output_buf[2]
    shll_s.w    t5, t5, 24
    shll_s.w    t6, t6, 24
    shll_s.w    t7, t7, 24
    shll_s.w    t8, t8, 24
    sra         t5, t5, 24
    sra         t6, t6, 24
    sra         t7, t7, 24
    sra         t8, t8, 24
    addu        v0, v0, a3      // outptr = output_buf[2] + output_col
    addiu       t5, t5, 128
    addiu       t6, t6, 128
    addiu       t7, t7, 128
    addiu       t8, t8, 128
    sb          t5, 0(v0)
    sb          t7, 1(v0)
    sb          t8, 2(v0)
    sb          t6, 3(v0)

    // Pass 2, workspace row 3 -> output_buf[3]
    li          s4, 15137
    lw          s6, 104(t1)     // wsptr[2]
    li          s5, 6270
    lw          s7, 120(t1)     // wsptr[6]
    mul         s4, s4, s6      // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
    lw          t2, 96(t1)      // wsptr[0]
    mul         s5, s5, s7      // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
    lh          t5, 124(t1)     // wsptr[7]
    lh          t6, 116(t1)     // wsptr[5]
    lh          t7, 108(t1)     // wsptr[3]
    lh          t8, 100(t1)     // wsptr[1]
    ins         t5, t6, 16, 16
    ins         t7, t8, 16, 16
    mult        $ac0, zero, zero
    dpa.w.ph    $ac0, t5, s0
    dpa.w.ph    $ac0, t7, s1
    mult        $ac1, zero, zero
    dpa.w.ph    $ac1, t5, s2
    dpa.w.ph    $ac1, t7, s3
    sll         t2, t2, 14      // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
    mflo        s6, $ac0        // temp1
    // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
    subu        s4, s4, s5
    addu        t3, t2, s4      // tmp10 = tmp0 + tmp2
    mflo        s7, $ac1        // temp2
    subu        t4, t2, s4      // tmp12 = tmp0 - tmp2
    addu        t7, t4, s6
    subu        t8, t4, s6
    addu        t5, t3, s7
    subu        t6, t3, s7
    shra_r.w    t5, t5, 19      // DESCALE(tmp10 + temp2, 19)
    shra_r.w    t6, t6, 19      // DESCALE(tmp10 - temp2, 19)
    shra_r.w    t7, t7, 19      // DESCALE(tmp12 + temp1, 19)
    shra_r.w    t8, t8, 19      // DESCALE(tmp12 - temp1, 19)
    lw          v0, 12(a2)      // output_buf[3]
    shll_s.w    t5, t5, 24
    shll_s.w    t6, t6, 24
    shll_s.w    t7, t7, 24
    shll_s.w    t8, t8, 24
    sra         t5, t5, 24
    sra         t6, t6, 24
    sra         t7, t7, 24
    sra         t8, t8, 24
    addu        v0, v0, a3      // outptr = output_buf[3] + output_col
    addiu       t5, t5, 128
    addiu       t6, t6, 128
    addiu       t7, t7, 128
    addiu       t8, t8, 128
    sb          t5, 0(v0)
    sb          t7, 1(v0)
    sb          t8, 2(v0)
    sb          t6, 3(v0)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_idct_4x4_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_6x6_dspr2)
/*
 * a0 = compptr->dct_table
 * a1 = coef_block
 * a2 = output_buf
 * a3 = output_col
 *
 * Inverse DCT producing a 6x6 block of output samples.
 *
 * Pass 1 walks the first six columns of the coefficient block, reading rows
 * 0..5 of each column (byte offsets 0, 16, ..., 80), and writes six 32-bit
 * results per column into a 6x6-word workspace (144 bytes, 24 bytes per
 * row) allocated on the stack below the register save frame.
 *
 * Pass 2 walks the six workspace rows and writes six bytes per row to
 * output_buf[row] + output_col.  Each result is saturated to the signed
 * 8-bit range and re-centred by adding 128 before it is stored.
 *
 * Constants held in registers for both passes:
 *   t9 = 5793, s0 = 10033, s1 = 2998
 * a0, a1 and a2 are advanced by the loops.
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu       sp, sp, -144    // 6 x 6 words of workspace
    move        v0, sp          // v0 = wsptr
    addiu       v1, v0, 24      // v1 = end marker: wsptr after 6 columns
    addiu       t9, zero, 5793
    addiu       s0, zero, 10033
    addiu       s1, zero, 2998

    /* Pass 1: process 6 columns from input, store into work array. */
1:
    lh          s2, 0(a0)       // q0 = quantptr[ 0]
    lh          s3, 32(a0)      // q1 = quantptr[16]
    lh          s4, 64(a0)      // q2 = quantptr[32]
    lh          t2, 64(a1)      // tmp2 = inptr[32]
    lh          t1, 32(a1)      // tmp1 = inptr[16]
    lh          t0, 0(a1)       // tmp0 = inptr[ 0]
    mul         t2, t2, s4      // tmp2 = tmp2 * q2
    mul         t1, t1, s3      // tmp1 = tmp1 * q1
    mul         t0, t0, s2      // tmp0 = tmp0 * q0
    lh          t6, 16(a1)      // z1 = inptr[ 8]
    lh          t8, 80(a1)      // z3 = inptr[40]
    lh          t7, 48(a1)      // z2 = inptr[24]
    lh          s2, 16(a0)      // q0 = quantptr[ 8]
    lh          s4, 80(a0)      // q2 = quantptr[40]
    lh          s3, 48(a0)      // q1 = quantptr[24]
    mul         t2, t2, t9      // tmp2 = tmp2 * 5793
    mul         t1, t1, s0      // tmp1 = tmp1 * 10033
    sll         t0, t0, 13      // tmp0 = tmp0 << 13
    mul         t6, t6, s2      // z1 = z1 * q0
    mul         t8, t8, s4      // z3 = z3 * q2
    mul         t7, t7, s3      // z2 = z2 * q1
    addu        t3, t0, t2      // tmp10 = tmp0 + tmp2
    sll         t2, t2, 1       // tmp2 = tmp2 << 1
    subu        t4, t0, t2      // tmp11 = tmp0 - tmp2;
    subu        t5, t3, t1      // tmp12 = tmp10 - tmp1
    addu        t3, t3, t1      // tmp10 = tmp10 + tmp1
    addu        t1, t6, t8      // tmp1 = z1 + z3
    mul         t1, t1, s1      // tmp1 = tmp1 * 2998
    shra_r.w    t4, t4, 11      // tmp11 = (tmp11 + 1024) >> 11
    subu        t2, t6, t8      // tmp2 = z1 - z3
    subu        t2, t2, t7      // tmp2 = tmp2 - z2
    sll         t2, t2, 2       // tmp2 = tmp2 << 2
    addu        t0, t6, t7      // tmp0 = z1 + z2
    sll         t0, t0, 13      // tmp0 = tmp0 << 13
    subu        s2, t8, t7      // q0 = z3 - z2
    sll         s2, s2, 13      // q0 = q0 << 13
    addu        t0, t0, t1      // tmp0 = tmp0 + tmp1
    addu        t1, s2, t1      // tmp1 = q0 + tmp1
    addu        s2, t4, t2      // q0 = tmp11 + tmp2
    subu        s3, t4, t2      // q1 = tmp11 - tmp2
    addu        t6, t3, t0      // z1 = tmp10 + tmp0
    subu        t7, t3, t0      // z2 = tmp10 - tmp0
    addu        t4, t5, t1      // tmp11 = tmp12 + tmp1
    subu        t5, t5, t1      // tmp12 = tmp12 - tmp1
    shra_r.w    t6, t6, 11      // z1 = (z1 + 1024) >> 11
    shra_r.w    t7, t7, 11      // z2 = (z2 + 1024) >> 11
    shra_r.w    t4, t4, 11      // tmp11 = (tmp11 + 1024) >> 11
    shra_r.w    t5, t5, 11      // tmp12 = (tmp12 + 1024) >> 11
    sw          s2, 24(v0)      // workspace row 1
    sw          s3, 96(v0)      // workspace row 4
    sw          t6, 0(v0)       // workspace row 0
    sw          t7, 120(v0)     // workspace row 5
    sw          t4, 48(v0)      // workspace row 2
    sw          t5, 72(v0)      // workspace row 3
    addiu       v0, v0, 4       // next workspace column
    addiu       a1, a1, 2       // next input column
    bne         v0, v1, 1b
     addiu      a0, a0, 2       // (delay slot) next quantization column

    /* Pass 2: process 6 rows from work array, store into output array. */
    move        v0, sp          // v0 = wsptr (row 0)
    addiu       v1, v0, 144     // v1 = end of workspace

2:
    lw          t0, 0(v0)       // wsptr[0]
    lw          t2, 16(v0)      // wsptr[4]
    lw          s5, 0(a2)       // output_buf[row]
    addiu       t0, t0, 16      // add the rounding bias before scaling
    sll         t0, t0, 13      // tmp0 = (wsptr[0] + 16) << 13
    mul         t3, t2, t9      // tmp10 = wsptr[4] * 5793
    lw          t6, 4(v0)       // z1 = wsptr[1]
    lw          t8, 20(v0)      // z3 = wsptr[5]
    lw          t7, 12(v0)      // z2 = wsptr[3]
    addu        s5, s5, a3      // outptr = output_buf[row] + output_col
    addu        s6, t6, t8      // z1 + z3
    mul         s6, s6, s1      // (z1 + z3) * 2998
    addu        t1, t0, t3      // tmp0 + tmp10
    subu        t4, t0, t3
    subu        t4, t4, t3      // tmp11 = tmp0 - 2 * tmp10
    lw          t3, 8(v0)       // wsptr[2]
    mul         t0, t3, s0      // wsptr[2] * 10033
    addu        s7, t6, t7
    sll         s7, s7, 13
    addu        s7, s6, s7      // odd0 = (z1 + z3) * 2998 + ((z1 + z2) << 13)
    subu        t2, t8, t7
    sll         t2, t2, 13
    addu        t2, s6, t2      // odd2 = (z1 + z3) * 2998 + ((z3 - z2) << 13)
    subu        s6, t6, t7
    subu        s6, s6, t8
    sll         s6, s6, 13      // odd1 = (z1 - z2 - z3) << 13
    addu        t3, t1, t0      // tmp10 = tmp0 + wsptr[4] * 5793 + wsptr[2] * 10033
    subu        t5, t1, t0      // tmp12 = tmp0 + wsptr[4] * 5793 - wsptr[2] * 10033
    addu        t6, t3, s7      // out[0] = tmp10 + odd0
    subu        t3, t3, s7      // out[5] = tmp10 - odd0
    addu        t7, t4, s6      // out[1] = tmp11 + odd1
    subu        t4, t4, s6      // out[4] = tmp11 - odd1
    addu        t8, t5, t2      // out[2] = tmp12 + odd2
    subu        t5, t5, t2      // out[3] = tmp12 - odd2
    // Saturating shift left by 6 followed by sra 24 scales each value down
    // by 18 bits while clamping it to the signed 8-bit range.
    shll_s.w    t6, t6, 6
    shll_s.w    t3, t3, 6
    shll_s.w    t7, t7, 6
    shll_s.w    t4, t4, 6
    shll_s.w    t8, t8, 6
    shll_s.w    t5, t5, 6
    sra         t6, t6, 24
    addiu       t6, t6, 128
    sra         t3, t3, 24
    addiu       t3, t3, 128
    sb          t6, 0(s5)
    sra         t7, t7, 24
    addiu       t7, t7, 128
    sb          t3, 5(s5)
    sra         t4, t4, 24
    addiu       t4, t4, 128
    sb          t7, 1(s5)
    sra         t8, t8, 24
    addiu       t8, t8, 128
    sb          t4, 4(s5)
    addiu       v0, v0, 24      // next workspace row
    sra         t5, t5, 24
    addiu       t5, t5, 128
    sb          t8, 2(s5)
    addiu       a2, a2,  4      // next output row pointer
    bne         v0, v1, 2b
     sb         t5, 3(s5)

    addiu       sp, sp, 144     // release the workspace

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop

END(jsimd_idct_6x6_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
/*
 * a0 = compptr->dct_table
 * a1 = coef_block
 * a2 = workspace
 *
 * Column pass of the 12x12 inverse DCT.  The loop runs 8 times (a3 counts
 * down from 8), once per input column.  Each iteration reads coefficient
 * rows 0..7 of the column (byte offsets 0, 16, ..., 112), dequantizes them
 * with the matching dct_table entries and writes 12 results into the
 * workspace, one per output row.  Workspace rows are 8 words (32 bytes)
 * apart; a2 advances by one word per column.
 *
 * The odd part (rows 1, 3, 5, 7) is computed first and left in
 * t3 (tmp10), t1 (tmp11), t8 (tmp12), t9 (tmp13), t0 (tmp14), t6 (tmp15).
 * The even part (rows 0, 2, 4, 6) yields tmp20..tmp25.  The final stage
 * stores (tmp2x +/- tmp1x) >> 11; the rounding bias of 1024 is added once
 * to z3, which is a term of every tmp2x.
 *
 * a0 and a1 are advanced by one halfword per iteration.
 */
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    li          a3, 8           // column counter

1:
    // odd part
    lh          t0, 48(a1)
    lh          t1, 48(a0)
    lh          t2, 16(a1)
    lh          t3, 16(a0)
    lh          t4, 80(a1)
    lh          t5, 80(a0)
    lh          t6, 112(a1)
    lh          t7, 112(a0)
    mul         t0, t0, t1      // z2
    mul         t1, t2, t3      // z1
    mul         t2, t4, t5      // z3
    mul         t3, t6, t7      // z4
    li          t4, 10703       // FIX(1.306562965)
    li          t5, 4433        // FIX_0_541196100
    li          t6, 7053        // FIX(0.860918669)
    mul         t4, t0, t4      // tmp11
    mul         t5, t0, t5      // -tmp14
    addu        t7, t1, t2      // tmp10
    addu        t8, t7, t3      // tmp10 + z4
    mul         t6, t6, t8      // tmp15
    li          t8, 2139        // FIX(0.261052384)
    mul         t8, t7, t8      // MULTIPLY(tmp10, FIX(0.261052384))
    li          t7, 2295        // FIX(0.280143716)
    mul         t7, t1, t7      // MULTIPLY(z1, FIX(0.280143716))
    addu        t9, t2, t3      // z3 + z4
    li          s0, 8565        // FIX(1.045510580)
    mul         t9, t9, s0      // -tmp13
    li          s0, 12112       // FIX(1.478575242)
    mul         s0, t2, s0      // MULTIPLY(z3, FIX(1.478575242)
    li          s1, 12998       // FIX(1.586706681)
    mul         s1, t3, s1      // MULTIPLY(z4, FIX(1.586706681))
    li          s2, 5540        // FIX(0.676326758)
    mul         s2, t1, s2      // MULTIPLY(z1, FIX(0.676326758))
    li          s3, 16244       // FIX(1.982889723)
    mul         s3, t3, s3      // MULTIPLY(z4, FIX(1.982889723))
    subu        t1, t1, t3      // z1-=z4
    subu        t0, t0, t2      // z2-=z3
    addu        t2, t0, t1      // z1+z2
    li          t3, 4433        // FIX_0_541196100
    mul         t2, t2, t3      // z3
    li          t3, 6270        // FIX_0_765366865
    mul         t1, t1, t3      // MULTIPLY(z1, FIX_0_765366865)
    li          t3, 15137       // FIX_1_847759065
    mul         t0, t0, t3      // MULTIPLY(z2, FIX_1_847759065)
    addu        t8, t6, t8      // tmp12
    addu        t3, t8, t4      // tmp12 + tmp11
    addu        t3, t3, t7      // tmp10
    subu        t8, t8, t9      // tmp12 + tmp13
    addu        s0, t5, s0
    subu        t8, t8, s0      // tmp12
    subu        t9, t6, t9
    subu        s1, s1, t4
    addu        t9, t9, s1      // tmp13
    subu        t6, t6, t5
    subu        t6, t6, s2
    subu        t6, t6, s3      // tmp15
    // even part start
    lh          t4, 64(a1)
    lh          t5, 64(a0)
    lh          t7, 32(a1)
    lh          s0, 32(a0)
    lh          s1, 0(a1)
    lh          s2, 0(a0)
    lh          s3, 96(a1)
    lh          v0, 96(a0)
    mul         t4, t4, t5      // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
    mul         t5, t7, s0      // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
    mul         t7, s1, s2      // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
    mul         s0, s3, v0      // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
    // odd part end
    addu        t1, t2, t1      // tmp11
    subu        t0, t2, t0      // tmp14
    // update counter and pointers
    addiu       a3, a3, -1
    addiu       a0, a0, 2
    addiu       a1, a1, 2
    // even part rest
    li          s1, 10033
    li          s2, 11190
    mul         t4, t4, s1      // z4 = row4 * 10033
    mul         s1, t5, s2      // row2 * 11190
    sll         t5, t5, 13      // z1
    sll         t7, t7, 13
    addiu       t7, t7, 1024    // z3 (includes the rounding bias for the final >> 11)
    sll         s0, s0, 13      // z2
    addu        s2, t7, t4      // tmp10
    subu        t4, t7, t4      // tmp11
    subu        s3, t5, s0      // tmp12
    addu        t2, t7, s3      // tmp21
    subu        s3, t7, s3      // tmp24
    addu        t7, s1, s0      // tmp12
    addu        v0, s2, t7      // tmp20
    subu        s2, s2, t7      // tmp25
    subu        s1, s1, t5      // z4 - z1
    subu        s1, s1, s0      // tmp12
    addu        s0, t4, s1      // tmp22
    subu        t4, t4, s1      // tmp23
    // final output stage
    addu        t5, v0, t3      // tmp20 + tmp10
    subu        v0, v0, t3      // tmp20 - tmp10
    addu        t3, t2, t1      // tmp21 + tmp11
    subu        t2, t2, t1      // tmp21 - tmp11
    addu        t1, s0, t8      // tmp22 + tmp12
    subu        s0, s0, t8      // tmp22 - tmp12
    addu        t8, t4, t9      // tmp23 + tmp13
    subu        t4, t4, t9      // tmp23 - tmp13
    addu        t9, s3, t0      // tmp24 + tmp14
    subu        s3, s3, t0      // tmp24 - tmp14
    addu        t0, s2, t6      // tmp25 + tmp15
    subu        s2, s2, t6      // tmp25 - tmp15
    sra         t5, t5, 11
    sra         t3, t3, 11
    sra         t1, t1, 11
    sra         t8, t8, 11
    sra         t9, t9, 11
    sra         t0, t0, 11
    sra         s2, s2, 11
    sra         s3, s3, 11
    sra         t4, t4, 11
    sra         s0, s0, 11
    sra         t2, t2, 11
    sra         v0, v0, 11
    // One word per workspace row, rows 0..11 (32 bytes apart).
    sw          t5, 0(a2)
    sw          t3, 32(a2)
    sw          t1, 64(a2)
    sw          t8, 96(a2)
    sw          t9, 128(a2)
    sw          t0, 160(a2)
    sw          s2, 192(a2)
    sw          s3, 224(a2)
    sw          t4, 256(a2)
    sw          s0, 288(a2)
    sw          t2, 320(a2)
    sw          v0, 352(a2)
    bgtz        a3, 1b
     addiu      a2, a2, 4       // (delay slot) next workspace column

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j           ra
     nop

END(jsimd_idct_12x12_pass1_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
/*
 * Second pass of the 12x12 inverse DCT.
 *
 * a0 = workspace (read as 32-bit words; 8 words = 32 bytes are consumed per
 *      loop iteration)
 * a1 = output (array of 32-bit pointers; one pointer is consumed per loop
 *      iteration and 12 bytes are stored through it)
 *
 * The loop runs 12 times (counter in a3).  Each iteration computes an "odd
 * part" from workspace words 1, 3, 5 and 7 and an "even part" from words
 * 0, 2, 4 and 6, combines them into 12 values, scales each down by a net
 * 18 bits with saturation, adds 128 and stores the low byte.
 *
 * All multiplier constants are the quoted FIX() values scaled by 2^13.
 *
 * Scratch registers: v0, a3, t0-t9.  s0-s3 are saved on entry and restored
 * before returning.  No value is returned.
 */
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    li          a3, 12          // a3 = number of iterations remaining

1:
    // Odd part
    lw          t0, 12(a0)      // z2 (workspace word 3)
    lw          t1, 4(a0)       // z1 (workspace word 1)
    lw          t2, 20(a0)      // z3 (workspace word 5)
    lw          t3, 28(a0)      // z4 (workspace word 7)
    li          t4, 10703       // FIX(1.306562965)
    li          t5, 4433        // FIX_0_541196100
    mul         t4, t0, t4      // tmp11
    mul         t5, t0, t5      // -tmp14
    addu        t6, t1, t2      // tmp10
    li          t7, 2139        // FIX(0.261052384)
    mul         t7, t6, t7      // MULTIPLY(tmp10, FIX(0.261052384))
    addu        t6, t6, t3      // tmp10 + z4
    li          t8, 7053        // FIX(0.860918669)
    mul         t6, t6, t8      // tmp15
    li          t8, 2295        // FIX(0.280143716)
    mul         t8, t1, t8      // MULTIPLY(z1, FIX(0.280143716))
    addu        t9, t2, t3      // z3 + z4
    li          s0, 8565        // FIX(1.045510580)
    mul         t9, t9, s0      // -tmp13
    li          s0, 12112       // FIX(1.478575242)
    mul         s0, t2, s0      // MULTIPLY(z3, FIX(1.478575242))
    li          s1, 12998       // FIX(1.586706681)
    mul         s1, t3, s1      // MULTIPLY(z4, FIX(1.586706681))
    li          s2, 5540        // FIX(0.676326758)
    mul         s2, t1, s2      // MULTIPLY(z1, FIX(0.676326758))
    li          s3, 16244       // FIX(1.982889723)
    mul         s3, t3, s3      // MULTIPLY(z4, FIX(1.982889723))
    subu        t1, t1, t3      // z1 -= z4
    subu        t0, t0, t2      // z2 -= z3
    addu        t2, t1, t0      // z1 + z2
    li          t3, 4433        // FIX_0_541196100
    mul         t2, t2, t3      // z3
    li          t3, 6270        // FIX_0_765366865
    mul         t1, t1, t3      // MULTIPLY(z1, FIX_0_765366865)
    li          t3, 15137       // FIX_1_847759065
    mul         t0, t0, t3      // MULTIPLY(z2, FIX_1_847759065)
    // t5 and t9 hold the negated tmp14 and tmp13 products, hence the
    // subtractions below where the terms are conceptually added.
    addu        t3, t6, t7      // tmp12
    addu        t7, t3, t4
    addu        t7, t7, t8      // tmp10
    subu        t3, t3, t9
    subu        t3, t3, t5
    subu        t3, t3, s0      // tmp12
    subu        t9, t6, t9
    subu        t9, t9, t4
    addu        t9, t9, s1      // tmp13
    subu        t6, t6, t5
    subu        t6, t6, s2
    subu        t6, t6, s3      // tmp15
    addu        t1, t2, t1      // tmp11
    subu        t0, t2, t0      // tmp14
    // Odd-part results now live in: t7 = tmp10, t1 = tmp11, t3 = tmp12,
    // t9 = tmp13, t0 = tmp14, t6 = tmp15.
    // even part
    lw          t2, 16(a0)      // z4
    lw          t4, 8(a0)       // z1
    lw          t5, 0(a0)       // z3
    lw          t8, 24(a0)      // z2
    li          s0, 10033       // FIX(1.224744871)
    li          s1, 11190       // FIX(1.366025404)
    mul         t2, t2, s0      // z4 = MULTIPLY(z4, FIX(1.224744871))
    mul         s0, t4, s1      // second z4 = MULTIPLY(z1, FIX(1.366025404))
    // Rounding bias: 0x10 << 13 = 1 << 17, i.e. half of the net 18-bit
    // right shift applied in the final stage.
    addiu       t5, t5, 0x10
    sll         t5, t5, 13      // z3
    sll         t4, t4, 13      // z1
    sll         t8, t8, 13      // z2
    subu        s1, t4, t8      // tmp12
    addu        s2, t5, t2      // tmp10
    subu        t2, t5, t2      // tmp11
    addu        s3, t5, s1      // tmp21
    subu        s1, t5, s1      // tmp24
    addu        t5, s0, t8      // tmp12
    addu        v0, s2, t5      // tmp20
    subu        t5, s2, t5      // tmp25
    subu        t4, s0, t4
    subu        t4, t4, t8      // tmp12
    addu        t8, t2, t4      // tmp22
    subu        t2, t2, t4      // tmp23
    // Even-part results now live in: v0 = tmp20, s3 = tmp21, t8 = tmp22,
    // t2 = tmp23, s1 = tmp24, t5 = tmp25.
    // increment counter and pointers
    addiu       a3, a3, -1
    addiu       a0, a0, 32      // advance workspace by 8 words
    // Final stage: each even-part term is combined with the matching odd-part
    // term.  The sums become output bytes 0..5 and the differences become
    // output bytes 11..6.
    addu        t4, v0, t7      // out[0]  = tmp20 + tmp10
    subu        v0, v0, t7      // out[11] = tmp20 - tmp10
    addu        t7, s3, t1      // out[1]  = tmp21 + tmp11
    subu        s3, s3, t1      // out[10] = tmp21 - tmp11
    addu        t1, t8, t3      // out[2]  = tmp22 + tmp12
    subu        t8, t8, t3      // out[9]  = tmp22 - tmp12
    addu        t3, t2, t9      // out[3]  = tmp23 + tmp13
    subu        t2, t2, t9      // out[8]  = tmp23 - tmp13
    addu        t9, s1, t0      // out[4]  = tmp24 + tmp14
    subu        s1, s1, t0      // out[7]  = tmp24 - tmp14
    addu        t0, t5, t6      // out[5]  = tmp25 + tmp15
    subu        t5, t5, t6      // out[6]  = tmp25 - tmp15
    // Descale to 8 bits.  The plain << 4 discards bits 28..31, the saturating
    // << 2 then clamps, and the logical >> 24 below keeps the top byte: a net
    // arithmetic >> 18 clamped to a signed byte.
    // NOTE(review): dropping the top 4 bits before saturating presumably
    // mirrors a masked range-limit lookup in the C implementation -- confirm.
    sll         t4, t4, 4
    sll         t7, t7, 4
    sll         t1, t1, 4
    sll         t3, t3, 4
    sll         t9, t9, 4
    sll         t0, t0, 4
    sll         t5, t5, 4
    sll         s1, s1, 4
    sll         t2, t2, 4
    sll         t8, t8, 4
    sll         s3, s3, 4
    sll         v0, v0, 4
    shll_s.w    t4, t4, 2
    shll_s.w    t7, t7, 2
    shll_s.w    t1, t1, 2
    shll_s.w    t3, t3, 2
    shll_s.w    t9, t9, 2
    shll_s.w    t0, t0, 2
    shll_s.w    t5, t5, 2
    shll_s.w    s1, s1, 2
    shll_s.w    t2, t2, 2
    shll_s.w    t8, t8, 2
    shll_s.w    s3, s3, 2
    shll_s.w    v0, v0, 2
    srl         t4, t4, 24
    srl         t7, t7, 24
    srl         t1, t1, 24
    srl         t3, t3, 24
    srl         t9, t9, 24
    srl         t0, t0, 24
    srl         t5, t5, 24
    srl         s1, s1, 24
    srl         t2, t2, 24
    srl         t8, t8, 24
    srl         s3, s3, 24
    srl         v0, v0, 24
    lw          t6, 0(a1)       // t6 = destination pointer (tmp15 is dead now)
    // Add 128 to re-centre the signed byte.  Only the low 8 bits are stored
    // by sb, so the carry out of bit 7 is irrelevant.
    addiu       t4, t4, 0x80
    addiu       t7, t7, 0x80
    addiu       t1, t1, 0x80
    addiu       t3, t3, 0x80
    addiu       t9, t9, 0x80
    addiu       t0, t0, 0x80
    addiu       t5, t5, 0x80
    addiu       s1, s1, 0x80
    addiu       t2, t2, 0x80
    addiu       t8, t8, 0x80
    addiu       s3, s3, 0x80
    addiu       v0, v0, 0x80
    sb          t4, 0(t6)
    sb          t7, 1(t6)
    sb          t1, 2(t6)
    sb          t3, 3(t6)
    sb          t9, 4(t6)
    sb          t0, 5(t6)
    sb          t5, 6(t6)
    sb          s1, 7(t6)
    sb          t2, 8(t6)
    sb          t8, 9(t6)
    sb          s3, 10(t6)
    sb          v0, 11(t6)
    bgtz        a3, 1b          // loop while iterations remain
     addiu      a1, a1, 4       // (delay slot) step to the next output pointer

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    jr          ra
     nop

END(jsimd_idct_12x12_pass2_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_convsamp_dspr2)
/*
 * Load an 8x8 block of unsigned bytes, subtract 128 from each, and store the
 * results as 64 consecutive 16-bit values (128 bytes) in the workspace.
 *
 * a0 = sample_data (array of 8 row pointers, read at offsets 0..28)
 * a1 = start_col   (byte offset added to every row pointer)
 * a2 = workspace   (written at offsets 0..124, 16 bytes per row)
 *
 * Per row: two unaligned word loads fetch 8 bytes, preceu.ph.qbr/qbl
 * zero-extend each pair of bytes into a pair of halfwords, and addu.ph adds
 * 0xff80 (-128 modulo 2^16) to every halfword.
 *
 * The code is software-pipelined: the loads and expansions for row N+1 are
 * interleaved with the bias additions and stores for row N, so t1-t6 are
 * reused across row boundaries and the instruction order matters.
 *
 * Scratch registers: t0-t7.  No stack is used and no value is returned.
 *
 * NOTE(review): storing the "right" byte pair (t3/t5) ahead of the "left"
 * pair (t4/t6) appears to keep the samples in memory order only on a
 * little-endian target -- confirm big-endian is not a supported build.
 */
    // Row 0: load and expand
    lw            t0, 0(a0)
    li            t7, 0xff80ff80    // -128 in each halfword lane
    addu          t0, t0, a1        // t0 = row 0 pointer + start_col
    ulw           t1, 0(t0)         // bytes 0..3
    ulw           t2, 4(t0)         // bytes 4..7
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    lw            t0, 4(a0)         // prefetch the row 1 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)         // row 1 bytes 0..3
    ulw           t2, 4(t0)         // row 1 bytes 4..7
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    // Row 0: store; row 1: expand
    usw           t3, 0(a2)
    usw           t4, 4(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 8(a2)
    usw           t6, 12(a2)

    // Row 1: bias and store; row 2: load and expand
    lw            t0, 8(a0)
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 16(a2)
    usw           t4, 20(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 24(a2)
    usw           t6, 28(a2)

    // Row 2: bias and store; row 3: load and expand
    lw            t0, 12(a0)
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 32(a2)
    usw           t4, 36(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 40(a2)
    usw           t6, 44(a2)

    // Row 3: bias and store; row 4: load and expand
    lw            t0, 16(a0)
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 48(a2)
    usw           t4, 52(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 56(a2)
    usw           t6, 60(a2)

    // Row 4: bias and store; row 5: load and expand
    lw            t0, 20(a0)
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 64(a2)
    usw           t4, 68(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 72(a2)
    usw           t6, 76(a2)

    // Row 5: bias and store; row 6: load and expand
    lw            t0, 24(a0)
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 80(a2)
    usw           t4, 84(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 88(a2)
    usw           t6, 92(a2)

    // Row 6: bias and store; row 7: load and expand
    lw            t0, 28(a0)
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 96(a2)
    usw           t4, 100(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 104(a2)
    usw           t6, 108(a2)
    // Row 7: pipeline drain -- finish expanding, bias and store
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 112(a2)
    usw           t4, 116(a2)
    usw           t5, 120(a2)
    usw           t6, 124(a2)

    j             ra                // return
     nop                            // (delay slot)

END(jsimd_convsamp_dspr2)


#ifndef __mips_soft_float

/*****************************************************************************/
LEAF_DSPR2(jsimd_convsamp_float_dspr2)
/*
 * Load an 8x8 block of unsigned bytes, subtract 128 from each, convert the
 * results to single-precision floats and store them as 64 consecutive
 * 32-bit values (256 bytes) in the workspace.
 *
 * a0 = sample_data (array of 8 row pointers, read at offsets 0..28)
 * a1 = start_col   (byte offset added to every row pointer)
 * a2 = workspace   (written at offsets 0..252, 32 bytes per row)
 *
 * The same fully unrolled sequence is repeated for each of the 8 rows:
 *   1. lbu      - load 8 unsigned bytes into t1-t8
 *   2. addiu    - subtract 128 from each
 *   3. mtc1     - move the integers into f2, f4, ..., f16
 *   4. cvt.s.w  - convert each integer to a single-precision float in place
 *   5. swc1     - store the 8 floats
 * The pointer for the following row is loaded into t0 between steps 4 and 5
 * (t0 is no longer needed once the bytes are loaded), and start_col is added
 * to it part-way through the stores.
 *
 * Scratch registers: t0-t8 and the even-numbered FP registers f2-f16.  No
 * stack is used and no value is returned.
 */
    // Allow the assembler to use the $at register in this routine.
    // NOTE(review): no instruction below obviously needs $at -- confirm
    // whether this directive is still required.
    .set at

    // row 0
    lw          t0, 0(a0)
    addu        t0, t0, a1      // t0 = row 0 pointer + start_col
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 4(a0)       // fetch the row 1 pointer early
    swc1        f2, 0(a2)
    swc1        f4, 4(a2)
    swc1        f6, 8(a2)
    addu        t0, t0, a1
    swc1        f8, 12(a2)
    swc1        f10, 16(a2)
    swc1        f12, 20(a2)
    swc1        f14, 24(a2)
    swc1        f16, 28(a2)
    // row 1
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 8(a0)       // fetch the row 2 pointer early
    swc1        f2, 32(a2)
    swc1        f4, 36(a2)
    swc1        f6, 40(a2)
    addu        t0, t0, a1
    swc1        f8, 44(a2)
    swc1        f10, 48(a2)
    swc1        f12, 52(a2)
    swc1        f14, 56(a2)
    swc1        f16, 60(a2)
    // row 2
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 12(a0)      // fetch the row 3 pointer early
    swc1        f2, 64(a2)
    swc1        f4, 68(a2)
    swc1        f6, 72(a2)
    addu        t0, t0, a1
    swc1        f8, 76(a2)
    swc1        f10, 80(a2)
    swc1        f12, 84(a2)
    swc1        f14, 88(a2)
    swc1        f16, 92(a2)
    // row 3
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 16(a0)      // fetch the row 4 pointer early
    swc1        f2, 96(a2)
    swc1        f4, 100(a2)
    swc1        f6, 104(a2)
    addu        t0, t0, a1
    swc1        f8, 108(a2)
    swc1        f10, 112(a2)
    swc1        f12, 116(a2)
    swc1        f14, 120(a2)
    swc1        f16, 124(a2)
    // row 4
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 20(a0)      // fetch the row 5 pointer early
    swc1        f2, 128(a2)
    swc1        f4, 132(a2)
    swc1        f6, 136(a2)
    addu        t0, t0, a1
    swc1        f8, 140(a2)
    swc1        f10, 144(a2)
    swc1        f12, 148(a2)
    swc1        f14, 152(a2)
    swc1        f16, 156(a2)
    // row 5
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 24(a0)      // fetch the row 6 pointer early
    swc1        f2, 160(a2)
    swc1        f4, 164(a2)
    swc1        f6, 168(a2)
    addu        t0, t0, a1
    swc1        f8, 172(a2)
    swc1        f10, 176(a2)
    swc1        f12, 180(a2)
    swc1        f14, 184(a2)
    swc1        f16, 188(a2)
    // row 6
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 28(a0)      // fetch the row 7 pointer early
    swc1        f2, 192(a2)
    swc1        f4, 196(a2)
    swc1        f6, 200(a2)
    addu        t0, t0, a1
    swc1        f8, 204(a2)
    swc1        f10, 208(a2)
    swc1        f12, 212(a2)
    swc1        f14, 216(a2)
    swc1        f16, 220(a2)
    // row 7 (last row: no further pointer to fetch)
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    swc1        f2, 224(a2)
    swc1        f4, 228(a2)
    swc1        f6, 232(a2)
    swc1        f8, 236(a2)
    swc1        f10, 240(a2)
    swc1        f12, 244(a2)
    swc1        f14, 248(a2)
    swc1        f16, 252(a2)

    j           ra              // return
     nop                        // (delay slot)

END(jsimd_convsamp_float_dspr2)

#endif

/*****************************************************************************/