/* libs/pixelflinger/t32cb16blend.S
**
** Copyright 2010, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

/*
 * DBG prefixes the optional timing-instrumentation lines in this file.
 * When DEBUG is defined it expands to nothing, so those lines are
 * assembled as written.  Otherwise it expands to '#', the same character
 * this file uses for end-of-line comments, so the remainder of each
 * DBG-prefixed line is ignored by the assembler.
 */
#ifdef DEBUG
#define DBG
#else
#define DBG #
#endif

/*
 * blend one of 2 16bpp RGB pixels held in dreg selected by shift
 * with the 32bpp ABGR pixel held in src and store the result in fb
 *
 * Assumes that the dreg data is little endian and that
 * the second pixel (shift==16) will be merged into
 * the fb result
 *
 * Uses $t0,$t6,$t7,$t8
 */

#if __mips==32 && __mips_isa_rev>=2
	.macro pixel dreg src fb shift
	/*
	 * Blend one 32bpp source pixel over one 16bpp (5:6:5) destination
	 * pixel, using the ext/ins bit-field instructions.
	 *
	 *   dreg  - register holding the destination pixel at bit offset shift
	 *   src   - register holding the 32bpp source pixel
	 *           (alpha in bits 31..24, colour fields taken from
	 *            bits 7..3, 15..10 and 23..19)
	 *   fb    - register that receives the blended pixel at bit offset
	 *           shift; ins only rewrites that 16-bit field, the other
	 *           bits of fb are left as they were
	 *   shift - bit offset of the pixel inside dreg/fb (callers in this
	 *           file pass 0 or 16)
	 *
	 * Clobbers $t0,$t6,$t7,$t8.  With DBG active it also uses $at and
	 * updates $v0/$v1.
	 *
	 * sA = s >> 24
	 * f = 0x100 - (sA + (sA>>7))
	 */
	/* DBG only: sample hardware register $2 into $at as the start stamp */
DBG	.set	noat
DBG	rdhwr	$at,$2
DBG	.set	at

	/*
	 * $t7 = f, the 0..0x100 weight applied to every destination channel.
	 * Adding sA>>7 maps sA==255 to 256 so that f becomes exactly 0.
	 */
	srl	$t7,\src,24
	srl	$t6,$t7,7
	addu	$t7,$t6
	li	$t6,0x100
	subu	$t7,$t6,$t7

	/*
	 * red: fb.red = src.red + ((dst.red * f) >> 8)
	 * The destination green field is extracted into $t0 while the red
	 * multiply is in flight; $t0 is consumed by the green step below.
	 */
	ext	$t8,\dreg,\shift+6+5,5			# dst[\shift:15..11]
	mul	$t6,$t8,$t7
	ext	$t0,\dreg,\shift+5,6			# start green extraction dst[\shift:10..5]
	ext	$t8,\src,3,5				# src[7..3]
	srl	$t6,8
	addu	$t8,$t6
	ins	\fb,$t8,\shift+6+5,5			# dst[\shift:15..11]

	/*
	 * green: $t8 = src.green + ((dst.green * f) >> 8)
	 * $t0 is reloaded with the destination blue field once the green
	 * multiply has read it.  The result stays in $t8 and is inserted
	 * into fb inside the blue step.
	 */
	mul	$t8,$t0,$t7
	ext	$t0,\dreg,\shift,5			# start blue extraction dst[\shift:4..0]
	ext	$t6,\src,2+8,6				# src[15..10]
	srl	$t8,8
        addu	$t8,$t6

	/*
	 * blue: fb.blue = src.blue + ((dst.blue * f) >> 8)
	 * The pending green result in $t8 must be inserted before $t8 is
	 * overwritten by the blue shift.
	 */
	mul	$t0,$t0,$t7
	ins	\fb,$t8,\shift+5,6			# finish green insertion dst[\shift:10..5]
	ext	$t6,\src,(3+8+8),5
	srl	$t8,$t0,8
	addu	$t8,$t6
	ins	\fb,$t8,\shift,5

	/*
	 * DBG only: $t8 = hardware register $2 now minus the start stamp in
	 * $at; $v0 keeps the smallest and $v1 the largest value seen.
	 */
DBG	.set	noat
DBG	rdhwr	$t8,$2
DBG	subu	$t8,$at
DBG	sltu	$at,$t8,$v0
DBG	movn	$v0,$t8,$at
DBG	sgtu	$at,$t8,$v1
DBG	movn	$v1,$t8,$at
DBG	.set	at
	.endm

#else

	.macro pixel dreg src fb shift
	/*
	 * Blend one 32bpp source pixel over one 16bpp (5:6:5) destination
	 * pixel using only shift/mask/or operations.
	 *
	 *   dreg  - register holding the destination pixel at bit offset shift
	 *   src   - register holding the 32bpp source pixel
	 *   fb    - result register; written outright when shift==0 and
	 *           OR-ed into when shift!=0
	 *   shift - bit offset of the pixel inside dreg/fb (callers in this
	 *           file pass 0 or 16)
	 *
	 * Clobbers $t6,$t7,$t8.  With DBG active it also uses $at and
	 * updates $v0/$v1.
	 *
	 * Weight applied to each destination channel:
	 *   sA = s >> 24
	 *   f  = 0x100 - (sA + (sA>>7))
	 */
	/* DBG only: sample hardware register $2 into $at as the start stamp */
DBG	.set	push
DBG	.set	noat
DBG	.set	mips32r2
DBG	rdhwr	$at,$2
DBG	.set	pop

	/* $t7 = f */
	srl	$t7,\src,24
	srl	$t6,$t7,7
	addu	$t7,$t7,$t6
	li	$t6,0x100
	subu	$t7,$t6,$t7

	/*
	 * red
	 *   dR = (d >> (shift + 11)) & 0x1f
	 *   sR = (s >> 3) & 0x1f
	 *   fb.red = sR + ((f * dR) >> 8)
	 * The mask on dR is only emitted for shift==0; for the upper pixel
	 * the logical right shift already leaves just the 5 red bits.
	 */
	srl	$t8,\dreg,\shift+6+5
.if \shift==0
	andi	$t8,$t8,0x1f
.endif
	mul	$t8,$t8,$t7
	srl	$t6,\src,3
	andi	$t6,$t6,0x1f
	srl	$t8,$t8,8
	addu	$t8,$t8,$t6
.if \shift==0
	/* first pixel: red initialises fb */
	sll	\fb,$t8,11
.else
	/* second pixel: merge into the fb built for the first pixel */
	sll	$t8,$t8,\shift+11
	or	\fb,\fb,$t8
.endif

	/*
	 * green
	 *   dG = (d >> (shift + 5)) & 0x3f
	 *   sG = (s >> (8 + 2)) & 0x3f
	 *   fb.green = sG + ((f * dG) >> 8)
	 */
	srl	$t8,\dreg,\shift+5
	andi	$t8,$t8,0x3f
	mul	$t8,$t8,$t7
	srl	$t6,\src,8+2
	andi	$t6,$t6,0x3f
	srl	$t8,$t8,8
	addu	$t8,$t8,$t6
	sll	$t8,$t8,\shift+5
	or	\fb,\fb,$t8

	/*
	 * blue
	 *   dB = (d >> shift) & 0x1f
	 *   sB = (s >> (8 + 8 + 3)) & 0x1f
	 *   fb.blue = sB + ((f * dB) >> 8)
	 */
.if \shift==0
	andi	$t8,\dreg,0x1f
.else
	srl	$t8,\dreg,\shift
	andi	$t8,$t8,0x1f
.endif
	mul	$t8,$t8,$t7
	srl	$t6,\src,8+8+3
	andi	$t6,$t6,0x1f
	srl	$t8,$t8,8
	addu	$t8,$t8,$t6
.if \shift!=0
	sll	$t8,$t8,\shift
.endif
	or	\fb,\fb,$t8

	/*
	 * DBG only: $t8 = hardware register $2 now minus the start stamp in
	 * $at; $v0 keeps the smallest and $v1 the largest value seen.
	 */
DBG	.set	push
DBG	.set	noat
DBG	.set	mips32r2
DBG	rdhwr	$t8,$2
DBG	subu	$t8,$t8,$at
DBG	sltu	$at,$t8,$v0
DBG	movn	$v0,$t8,$at
DBG	sgtu	$at,$t8,$v1
DBG	movn	$v1,$t8,$at
DBG	.set	pop
	.endm
#endif

	.text
	.align

/*
 * scanline_t32cb16blend_mips
 *
 * Blends a run of 32bpp source pixels over a run of 16bpp destination
 * pixels, one pixel macro expansion per blended pixel.
 *
 * In:    $a0 = destination pointer (16bpp pixels, read and rewritten)
 *        $a1 = source pointer (32bpp pixels, read only)
 *        $a2 = number of pixels
 * Uses:  $t0,$t1,$t3,$t4,$t5,$t6,$t7,$t8; $a0,$a1,$a2 are advanced/consumed.
 *        With DBG active $at,$v0,$v1 are used as well.
 *
 * Structure:
 *   1. if the destination is not word aligned, blend a single pixel;
 *   2. blend 4 pixels per iteration as two word-sized destination pairs;
 *   3. blend the remaining 0..3 pixels one at a time.
 *
 * A source word of zero leaves the destination untouched: the single-pixel
 * paths skip it, and a pair is skipped when both of its source words are
 * zero.
 */
	.global scanline_t32cb16blend_mips
	.ent	scanline_t32cb16blend_mips
scanline_t32cb16blend_mips:
	/* DBG only: seed the per-pixel minimum ($v0) and maximum ($v1) */
DBG	li	$v0,0xffffffff
DBG	li	$v1,0
	/* Align the destination if necessary */
	and	$t0,$a0,3
	beqz	$t0,aligned

	/* as long as there is at least one pixel */
	beqz	$a2,done

	/* blend one leading pixel; pointers are advanced before the store */
	lw	$t4,($a1)
	addu	$a0,2
	addu	$a1,4
	beqz	$t4,1f
	lhu	$t3,-2($a0)
	pixel   $t3,$t4,$t1,0
	sh	$t1,-2($a0)
1:	subu	$a2,1

aligned:
	/*
	 * Check to see if its worth unrolling the loop.
	 * From here until tail, $a2 holds (pixels remaining - 4).
	 */
	subu	$a2,4
	bltz	$a2,tail

	/* Process 4 pixels at a time */
fourpixels:
	/* 1st pair of pixels */
	lw	$t4,0($a1)
	lw	$t5,4($a1)
	addu	$a0,8
	addu	$a1,16

	/* both are zero, skip this pair */
	or	$t3,$t4,$t5
	beqz	$t3,1f

	/* load the destination */
	lw	$t3,-8($a0)

	/* low half first (initialises $t1), then merge the high half */
	pixel	$t3,$t4,$t1,0
	pixel	$t3,$t5,$t1,16
	sw	$t1,-8($a0)

1:
	/* 2nd pair of pixels */
	lw	$t4,-8($a1)
	lw	$t5,-4($a1)

	/* both are zero, skip this pair */
	or	$t3,$t4,$t5
	beqz	$t3,1f

	/* load the destination */
	lw	$t3,-4($a0)

	pixel	$t3,$t4,$t1,0
	pixel	$t3,$t5,$t1,16
	sw	$t1,-4($a0)

	/*
	 * Keep looping while at least 4 pixels remain, i.e. while
	 * (remaining - 4) >= 0.  Testing for > 0 here would push a final
	 * group of exactly 4 pixels through the single-pixel tail.
	 */
1:	subu    $a2,4
	bgez	$a2,fourpixels

tail:
	/* the pixel count underran, restore it now */
	addu	$a2,4

	/* handle the last 0..3 pixels */
	beqz	$a2,done
onepixel:
	lw	$t4,($a1)
	addu	$a0,2
	addu	$a1,4
	beqz	$t4,1f
	lhu	$t3,-2($a0)
	pixel   $t3,$t4,$t1,0
	sh	$t1,-2($a0)
1:	subu	$a2,1
	bnez	$a2,onepixel
done:
	/* DBG only: scale the min/max counts by hardware register $3 */
DBG	.set    push
DBG	.set    mips32r2
DBG 	rdhwr	$a0,$3
DBG 	mul	$v0,$a0
DBG 	mul	$v1,$a0
DBG	.set    pop
	j	$ra
	.end	scanline_t32cb16blend_mips