/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 
/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define	LOAD_MASK(mvins,m1,m2) \
   	pushl	$0xff00ff00 ;\
   	pushl	$0xff00ff00 ;\
   	pushl	$0xff00ff00 ;\
   	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1	;\
   	pushl	$0x00ff0000 ;\
   	pushl	$0x00ff0000 ;\
   	pushl	$0x00ff0000 ;\
   	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2	;\
	addl	$32, %esp
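
/* After LOAD_MASK, m1 holds 0xff00ff00 replicated across the register
 * (the byte lanes that stay in place) and m2 holds 0x00ff0000 replicated
 * (the lane used while swapping the B and R bytes).
 */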

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */

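/* Rough C equivalent of the two macros above (illustrative sketch only,
 * not part of this file's build):
 *
 *    #include <stdint.h>
 *
 *    static uint32_t bgra_rev_to_rgba(uint32_t p)
 *    {
 *        p = __builtin_bswap32(p);     // ARGB -> BGRA
 *        return (p >> 8) | (p << 24);  // BGRA -> ABGR, i.e. ror 8
 *    }
 */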

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 * 
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */
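/* Assumed C prototype, inferred from the stack loads below:
 *
 *    void _generic_read_RGBA_span_BGRA8888_REV_MMX(const uint32_t *src,
 *                                                  uint32_t *dest,
 *                                                  unsigned count);
 */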

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
#endif
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

	movl	%ebx, %eax

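	/* %eax = ((-src) >> 2) & 1: one leading pixel must be handled
	 * separately if the source pointer is not 8-byte aligned.
	 */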
	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
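	/* The shrl above set ZF from %eax = count / 2; entering the loop
	 * at the test in .L18 means the body is skipped when it is zero.
	 */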
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what a single PSHUFB could do.  That
	 * instruction did not exist yet when this was written; it arrived
	 * later with SSSE3. :(
	 */
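	/* Scalar sketch of the same swizzle: swap bytes 0 and 2 of each
	 * 32-bit pixel while leaving bytes 1 and 3 in place:
	 *
	 *    y = (x & 0xff00ff00)
	 *      | ((x >> 16) & 0x000000ff)
	 *      | ((x << 16) & 0x00ff0000);
	 */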

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there is either 0 or 1 pixel remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
#endif
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

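	/* Set up a 16-byte-aligned scratch area on the stack; the loop
	 * below spills an XMM register there with movaps and reloads the
	 * two halves into MMX registers.
	 */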
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

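	/* Compute the number of leading pixels (at most 3) that must be
	 * copied one at a time until the source is 16-byte aligned, and
	 * clamp it to the total pixel count.
	 */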
	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	
	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	
	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */
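/* Unlike the MMX and SSE variants above, this routine works entirely in
 * the XMM registers, so callers do not need to bracket it with EMMS.
 */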

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
#endif
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()  
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx
        
	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx        
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()  
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
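
/* The arithmetic these constants implement, written out in C (hedged
 * sketch, not part of the build; the constants shown assume
 * SCALE_ADJUST == 5):
 *
 *    #include <stdint.h>
 *
 *    static uint8_t expand(uint16_t field, uint16_t prescale, uint16_t scale)
 *    {
 *        uint16_t t = (uint16_t)(field * prescale) >> 5;  // pmullw, psrlw
 *        return (uint8_t)(((uint32_t)t * scale) >> 16);   // pmulhuw
 *    }
 *
 * Red is expand(pixel & 0xf800, 0x0001, 0x20E8), green is
 * expand(pixel & 0x07e0, 0x0010, 0x40C6), and blue is
 * expand(pixel & 0x001f, 0x0200, 0x839d).
 */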

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
#ifndef USE_DRICORE
        .hidden _generic_read_RGBA_span_RGB565_MMX
#endif
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp

	sarl	$2, %ecx
	jl	.L01		/* Bail early if the count is negative. */
	jmp	.L02
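	/* The flags from the sarl above are still live, so jumping to the
	 * test at .L02 skips the loop entirely when fewer than 4 pixels
	 * remain.
	 */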

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

 	por %mm3, %mm0
 	por %mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

 	por %mm3, %mm0
 	por %mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there are either 2 or 3 left, process 2.
	 */

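	/* Reload the original pixel count; %ecx was consumed as the loop
	 * counter above.
	 */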
	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

 	por %mm3, %mm0
 	por %mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
         */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

 	por %mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */
	
#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif