#
# Copyright (C) 2011 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# IDCT implementation using the MIPS DSP ASE (little endian version)
#
# See MIPS Technologies Inc documents:
# "JPEG Decoder Optimization for MIPS32(R) Cores"  MD00483
#
# "MIPS32(R) Architecture for Programmers Volume IV-e: The MIPS(R) DSP
#       Application Specifice Extension to the MIPS32(R) Architecture" MD00374
#

        .set            noreorder
        .set            nomacro
        .set            noat

# This table has been moved to mips_jidctfst.c to avoid having to mess
# with the global pointer to make this code PIC.
#       .rdata
#
# mips_idct_coefs:
#       # Constant table of scaled IDCT coefficients.
#
#       .word           0x45464546              # FIX( 1.082392200 / 2) =  17734 = 0x4546
#       .word           0x5A825A82              # FIX( 1.414213562 / 2) =  23170 = 0x5A82
#       .word           0x76427642              # FIX( 1.847759065 / 2) =  30274 = 0x7642
#       .word           0xAC61AC61              # FIX(-2.613125930 / 4) = -21407 = 0xAC61

        .text

        .global         mips_idct_columns
        .ent            mips_idct_columns

# void mips_idct_columns(JCOEF * inptr, IFAST_MULT_TYPE * quantptr,
#                        DCTELEM * wsptr, const int * mips_idct_coefs);

mips_idct_columns:

# $a0   - inptr
# $a1   - quantptr
# $a2   - wsptr
# $a3, $at   - mips_idct_coefs
# $t0:7 - simd data
# $t8   - coefficients, temp
# $t9   - loop end address
# $s0:3 - simd quantization factors
# $s4:7 - temp results
# $v0:1 - temp results

        addiu           $sp, $sp, -32           # reserve stack space for s0-s7

        sw              $s0, 28($sp)
        sw              $s1, 24($sp)
        sw              $s2, 20($sp)
        sw              $s3, 16($sp)
        sw              $s4, 12($sp)
        sw              $s5,  8($sp)
        sw              $s6,  4($sp)
        sw              $s7,  0($sp)

        addiu           $t9, $a0, 16            # end address

        #lui            $at, %hi(mips_idct_coefs)
        #ori            $at, %lo(mips_idct_coefs)
        # move mips_idct_coefs address from $a3 into $at where the rest of this code expects it
        or              $at, $a3, $zero

loop_columns:

        lw              $s0, 0($a1)             # quantptr[DCTSIZE*0]

        lw              $t0, 0($a0)             # inptr[DCTSIZE*0]
        lw              $t1, 16($a0)            # inptr[DCTSIZE*1]

        muleq_s.w.phl   $v0, $t0, $s0           # tmp0 ...

        lw              $t2, 32($a0)            # inptr[DCTSIZE*2]
        lw              $t3, 48($a0)            # inptr[DCTSIZE*3]
        lw              $t4, 64($a0)            # inptr[DCTSIZE*4]
        lw              $t5, 80($a0)            # inptr[DCTSIZE*5]

        muleq_s.w.phr   $t0, $t0, $s0           # ... tmp0 ...

        lw              $t6, 96($a0)            # inptr[DCTSIZE*6]
        lw              $t7, 112($a0)           # inptr[DCTSIZE*7]

        or              $s4, $t1, $t2
        or              $s5, $t3, $t4

        bnez            $s4, full_column
        ins             $t0, $v0, 16, 16        # ... tmp0

        bnez            $s5, full_column
        or              $s6, $t5, $t6
        or              $s6, $s6, $t7
        bnez            $s6, full_column

        sw              $t0, 0($a2)             # wsptr[DCTSIZE*0]
        sw              $t0, 16($a2)            # wsptr[DCTSIZE*1]
        sw              $t0, 32($a2)            # wsptr[DCTSIZE*2]
        sw              $t0, 48($a2)            # wsptr[DCTSIZE*3]
        sw              $t0, 64($a2)            # wsptr[DCTSIZE*4]
        sw              $t0, 80($a2)            # wsptr[DCTSIZE*5]
        sw              $t0, 96($a2)            # wsptr[DCTSIZE*6]
        sw              $t0, 112($a2)           # wsptr[DCTSIZE*7]

        addiu           $a0, $a0, 4

        b               continue_columns
        addiu           $a1, $a1, 4


full_column:

        lw              $s1, 32($a1)            # quantptr[DCTSIZE*2]
        lw              $s2, 64($a1)            # quantptr[DCTSIZE*4]

        muleq_s.w.phl   $v0, $t2, $s1           # tmp1 ...
        muleq_s.w.phr   $t2, $t2, $s1           # ... tmp1 ...

        lw              $s0, 16($a1)            # quantptr[DCTSIZE*1]
        lw              $s1, 48($a1)            # quantptr[DCTSIZE*3]
        lw              $s3, 96($a1)            # quantptr[DCTSIZE*6]

        muleq_s.w.phl   $v1, $t4, $s2           # tmp2 ...
        muleq_s.w.phr   $t4, $t4, $s2           # ... tmp2 ...

        lw              $s2, 80($a1)            # quantptr[DCTSIZE*5]
        lw              $t8, 4($at)             # FIX(1.414213562)
        ins             $t2, $v0, 16, 16        # ... tmp1

        muleq_s.w.phl   $v0, $t6, $s3           # tmp3 ...
        muleq_s.w.phr   $t6, $t6, $s3           # ... tmp3 ...

        ins             $t4, $v1, 16, 16        # ... tmp2

        addq.ph         $s4, $t0, $t4           # tmp10
        subq.ph         $s5, $t0, $t4           # tmp11

        ins             $t6, $v0, 16, 16        # ... tmp3

        subq.ph         $s6, $t2, $t6           # tmp12 ...
        addq.ph         $s7, $t2, $t6           # tmp13

        mulq_rs.ph      $s6, $s6, $t8           # ... tmp12 ...

        addq.ph         $t0, $s4, $s7           # tmp0
        subq.ph         $t6, $s4, $s7           # tmp3

################

        muleq_s.w.phl   $v0, $t1, $s0           # tmp4 ...
        muleq_s.w.phr   $t1, $t1, $s0           # ... tmp4 ...

        shll_s.ph       $s6, $s6, 1             # x2

        lw              $s3, 112($a1)           # quantptr[DCTSIZE*7]

        subq.ph         $s6, $s6, $s7           # ... tmp12

        muleq_s.w.phl   $v1, $t7, $s3           # tmp7 ...
        muleq_s.w.phr   $t7, $t7, $s3           # ... tmp7 ...

        ins             $t1, $v0, 16, 16        # ... tmp4

        addq.ph         $t2, $s5, $s6           # tmp1
        subq.ph         $t4, $s5, $s6           # tmp2

        muleq_s.w.phl   $v0, $t5, $s2           # tmp6 ...
        muleq_s.w.phr   $t5, $t5, $s2           # ... tmp6 ...

        ins             $t7, $v1, 16, 16        # ... tmp7

        addq.ph         $s5, $t1, $t7           # z11
        subq.ph         $s6, $t1, $t7           # z12

        muleq_s.w.phl   $v1, $t3, $s1           # tmp5 ...
        muleq_s.w.phr   $t3, $t3, $s1           # ... tmp5 ...

        ins             $t5, $v0, 16, 16        # ... tmp6

# stalls

        ins             $t3, $v1, 16, 16        # ... tmp5


        addq.ph         $s7, $t5, $t3           # z13
        subq.ph         $v0, $t5, $t3           # z10

        addq.ph         $t7, $s5, $s7           # tmp7
        subq.ph         $s5, $s5, $s7           # tmp11 ...

        addq.ph         $v1, $v0, $s6           # z5 ...

        mulq_rs.ph      $s5, $s5, $t8           # ... tmp11

        lw              $t8, 8($at)             # FIX(1.847759065)
        lw              $s4, 0($at)             # FIX(1.082392200)

        addq.ph         $s0, $t0, $t7
        subq.ph         $s1, $t0, $t7

        mulq_rs.ph      $v1, $v1, $t8           # ... z5

        shll_s.ph       $s5, $s5, 1             # x2

        lw              $t8, 12($at)            # FIX(-2.613125930)
        sw              $s0, 0($a2)             # wsptr[DCTSIZE*0]

        mulq_rs.ph      $v0, $v0, $t8           # tmp12 ...
        mulq_rs.ph      $s4, $s6, $s4           # tmp10 ...

        shll_s.ph       $v1, $v1, 1             # x2

        addiu           $a0, $a0, 4
        addiu           $a1, $a1, 4

        sw              $s1, 112($a2)           # wsptr[DCTSIZE*7]

        shll_s.ph       $s6, $v0, 2             # x4
        shll_s.ph       $s4, $s4, 1             # x2
        addq.ph         $s6, $s6, $v1           # ... tmp12

        subq.ph         $t5, $s6, $t7           # tmp6
        subq.ph         $s4, $s4, $v1           # ... tmp10
        subq.ph         $t3, $s5, $t5           # tmp5
        addq.ph         $s2, $t2, $t5
        addq.ph         $t1, $s4, $t3           # tmp4
        subq.ph         $s3, $t2, $t5

        sw              $s2, 16($a2)            # wsptr[DCTSIZE*1]
        sw              $s3, 96($a2)            # wsptr[DCTSIZE*6]

        addq.ph         $v0, $t4, $t3
        subq.ph         $v1, $t4, $t3

        sw              $v0, 32($a2)            # wsptr[DCTSIZE*2]
        sw              $v1, 80($a2)            # wsptr[DCTSIZE*5]

        addq.ph         $v0, $t6, $t1
        subq.ph         $v1, $t6, $t1

        sw              $v0, 64($a2)            # wsptr[DCTSIZE*4]
        sw              $v1, 48($a2)            # wsptr[DCTSIZE*3]

continue_columns:

        bne             $a0, $t9, loop_columns
        addiu           $a2, $a2, 4


        lw              $s0, 28($sp)
        lw              $s1, 24($sp)
        lw              $s2, 20($sp)
        lw              $s3, 16($sp)
        lw              $s4, 12($sp)
        lw              $s5,  8($sp)
        lw              $s6,  4($sp)
        lw              $s7,  0($sp)

        jr              $ra
        addiu           $sp, $sp, 32


        .end            mips_idct_columns


##################################################################


        .global         mips_idct_rows
        .ent            mips_idct_rows

# void mips_idct_rows(DCTELEM * wsptr, JSAMPARRAY output_buf,
#                     JDIMENSION output_col, const int * mips_idct_coefs);

mips_idct_rows:

# $a0   - wsptr
# $a1   - output_buf
# $a2   - output_col
# $a3   - outptr
# $a3, $at   - mips_idct_coefs
# $t0:7 - simd data
# $t8   - coefficients, temp
# $t9   - loop end address
# $s0:3 - simd quantization factors
# $s4:7 - temp results
# s8    - const 0x80808080
# $v0:1 - temp results

SHIFT   =               2

        addiu           $sp, $sp, -48           # reserve stack space for s0-s8

        # save $a3 (mips_idct_coefs) because it might get clobbered below
        sw              $a3, 36($sp)

        sw              $s0, 32($sp)
        sw              $s1, 28($sp)
        sw              $s2, 24($sp)
        sw              $s3, 20($sp)
        sw              $s4, 16($sp)
        sw              $s5, 12($sp)
        sw              $s6,  8($sp)
        sw              $s7,  4($sp)
        sw              $s8,  0($sp)

        addiu           $t9, $a0, 128           # end address

        lui             $s8, 0x8080
        ori             $s8, $s8, 0x8080

loop_rows:

        lw              $at, 36($sp)            # restore saved $a3 (mips_idct_coefs)

        lw              $t0, 0+0($a0)           # wsptr[DCTSIZE*0+0/1]  b a
        lw              $s0, 16+0($a0)          # wsptr[DCTSIZE*1+0/1]  B A
        lw              $t2, 0+4($a0)           # wsptr[DCTSIZE*0+2/3]  d c
        lw              $s2, 16+4($a0)          # wsptr[DCTSIZE*1+2/3]  D C
        lw              $t4, 0+8($a0)           # wsptr[DCTSIZE*0+4/5]  f e
        lw              $s4, 16+8($a0)          # wsptr[DCTSIZE*1+4/5]  F E
        lw              $t6, 0+12($a0)          # wsptr[DCTSIZE*0+6/7]  h g
        lw              $s6, 16+12($a0)         # wsptr[DCTSIZE*1+6/7]  H G

        precrq.ph.w     $t1, $s0, $t0           # B b
        ins             $t0, $s0, 16, 16        # A a

        bnez            $t1, full_row
        or              $s0, $t2, $s2
        bnez            $s0, full_row
        or              $s0, $t4, $s4
        bnez            $s0, full_row
        or              $s0, $t6, $s6
        bnez            $s0, full_row

        shll_s.ph       $s0, $t0, SHIFT         # A a

        lw              $a3, 0($a1)
        lw              $at, 4($a1)

        precrq.ph.w     $t0, $s0, $s0           # A A
        ins             $s0, $s0, 16, 16        # a a

        addu            $a3, $a3, $a2
        addu            $at, $at, $a2

        precrq.qb.ph    $t0, $t0, $t0           # A A A A
        precrq.qb.ph    $s0, $s0, $s0           # a a a a


        addu.qb         $s0, $s0, $s8
        addu.qb         $t0, $t0, $s8


        sw              $s0, 0($a3)
        sw              $s0, 4($a3)

        sw              $t0, 0($at)
        sw              $t0, 4($at)


        addiu           $a0, $a0, 32

        bne             $a0, $t9, loop_rows
        addiu           $a1, $a1, 8

        b               exit_rows
        nop


full_row:

        precrq.ph.w     $t3, $s2, $t2
        ins             $t2, $s2, 16, 16

        precrq.ph.w     $t5, $s4, $t4
        ins             $t4, $s4, 16, 16

        precrq.ph.w     $t7, $s6, $t6
        ins             $t6, $s6, 16, 16


        lw              $t8, 4($at)             # FIX(1.414213562)

        addq.ph         $s4, $t0, $t4           # tmp10
        subq.ph         $s5, $t0, $t4           # tmp11

        subq.ph         $s6, $t2, $t6           # tmp12 ...
        addq.ph         $s7, $t2, $t6           # tmp13

        mulq_rs.ph      $s6, $s6, $t8           # ... tmp12 ...

        addq.ph         $t0, $s4, $s7           # tmp0
        subq.ph         $t6, $s4, $s7           # tmp3

        shll_s.ph       $s6, $s6, 1             # x2

        subq.ph         $s6, $s6, $s7           # ... tmp12

        addq.ph         $t2, $s5, $s6           # tmp1
        subq.ph         $t4, $s5, $s6           # tmp2

################

        addq.ph         $s5, $t1, $t7           # z11
        subq.ph         $s6, $t1, $t7           # z12

        addq.ph         $s7, $t5, $t3           # z13
        subq.ph         $v0, $t5, $t3           # z10

        addq.ph         $t7, $s5, $s7           # tmp7
        subq.ph         $s5, $s5, $s7           # tmp11 ...

        addq.ph         $v1, $v0, $s6           # z5 ...

        mulq_rs.ph      $s5, $s5, $t8           # ... tmp11

        lw              $t8, 8($at)             # FIX(1.847759065)
        lw              $s4, 0($at)             # FIX(1.082392200)

        addq.ph         $s0, $t0, $t7           # tmp0 + tmp7
        subq.ph         $s7, $t0, $t7           # tmp0 - tmp7

        mulq_rs.ph      $v1, $v1, $t8           # ... z5

        lw              $a3, 0($a1)
        lw              $t8, 12($at)            # FIX(-2.613125930)

        shll_s.ph       $s5, $s5, 1             # x2

        addu            $a3, $a3, $a2

        mulq_rs.ph      $v0, $v0, $t8           # tmp12 ...
        mulq_rs.ph      $s4, $s6, $s4           # tmp10 ...

        shll_s.ph       $v1, $v1, 1             # x2

        addiu           $a0, $a0, 32
        addiu           $a1, $a1, 8


        shll_s.ph       $s6, $v0, 2             # x4
        shll_s.ph       $s4, $s4, 1             # x2
        addq.ph         $s6, $s6, $v1           # ... tmp12

        shll_s.ph       $s0, $s0, SHIFT

        subq.ph         $t5, $s6, $t7           # tmp6
        subq.ph         $s4, $s4, $v1           # ... tmp10
        subq.ph         $t3, $s5, $t5           # tmp5

        shll_s.ph       $s7, $s7, SHIFT

        addq.ph         $t1, $s4, $t3           # tmp4


        addq.ph         $s1, $t2, $t5           # tmp1 + tmp6
        subq.ph         $s6, $t2, $t5           # tmp1 - tmp6

        addq.ph         $s2, $t4, $t3           # tmp2 + tmp5
        subq.ph         $s5, $t4, $t3           # tmp2 - tmp5

        addq.ph         $s4, $t6, $t1           # tmp3 + tmp4
        subq.ph         $s3, $t6, $t1           # tmp3 - tmp4


        shll_s.ph       $s1, $s1, SHIFT
        shll_s.ph       $s2, $s2, SHIFT
        shll_s.ph       $s3, $s3, SHIFT
        shll_s.ph       $s4, $s4, SHIFT
        shll_s.ph       $s5, $s5, SHIFT
        shll_s.ph       $s6, $s6, SHIFT


        precrq.ph.w     $t0, $s1, $s0           # B A
        ins             $s0, $s1, 16, 16        # b a

        precrq.ph.w     $t2, $s3, $s2           # D C
        ins             $s2, $s3, 16, 16        # d c

        precrq.ph.w     $t4, $s5, $s4           # F E
        ins             $s4, $s5, 16, 16        # f e

        precrq.ph.w     $t6, $s7, $s6           # H G
        ins             $s6, $s7, 16, 16        # h g

        precrq.qb.ph    $t0, $t2, $t0           # D C B A
        precrq.qb.ph    $s0, $s2, $s0           # d c b a

        precrq.qb.ph    $t4, $t6, $t4           # H G F E
        precrq.qb.ph    $s4, $s6, $s4           # h g f e


        addu.qb         $s0, $s0, $s8
        addu.qb         $s4, $s4, $s8


        sw              $s0, 0($a3)             # outptr[0/1/2/3]       d c b a
        sw              $s4, 4($a3)             # outptr[4/5/6/7]       h g f e

        lw              $a3, -4($a1)

        addu.qb         $t0, $t0, $s8

        addu            $a3, $a3, $a2

        addu.qb         $t4, $t4, $s8


        sw              $t0, 0($a3)             # outptr[0/1/2/3]       D C B A

        bne             $a0, $t9, loop_rows
        sw              $t4, 4($a3)             # outptr[4/5/6/7]       H G F E


exit_rows:

        lw              $s0, 32($sp)
        lw              $s1, 28($sp)
        lw              $s2, 24($sp)
        lw              $s3, 20($sp)
        lw              $s4, 16($sp)
        lw              $s5, 12($sp)
        lw              $s6,  8($sp)
        lw              $s7,  4($sp)
        lw              $s8,  0($sp)

        jr              $ra
        addiu           $sp, $sp, 48


        .end            mips_idct_rows