/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

/* Fallback definitions.  Every macro below is wrapped in #ifndef, so a
   definition supplied before this point takes precedence.  */

/* Symbol name under which the routine is assembled.  */
#ifndef MEMCPY
# define MEMCPY	memcpy
#endif

/* Build a file-local label: L(foo) expands to .Lfoo.  */
#ifndef L
# define L(label)	.L##label
#endif

/* Thin wrappers around the assembler's .cfi_* unwind directives.  */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* Open a function: mark NAME as a global function symbol, align it to
   16 bytes (.p2align 4), emit its label and start the CFI region.  */
#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

/* Close a function: end the CFI region and record the symbol size as the
   distance from NAME to the current location.  */
#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

/* Offsets from %esp of the three stack arguments at function entry (after
   ENTRANCE).  Consecutive arguments are 4 bytes apart.  PARMS is defined
   further down; that is fine because the preprocessor expands DEST only
   where it is used.  */
#define DEST		PARMS
#define SRC		DEST+4
#define LEN		SRC+4

/* Unwind bookkeeping for a 4-byte push of REG: the CFA offset grows by 4
   and REG is recorded as saved at offset 0.  Emits directives only, no
   instructions.  */
#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

/* Unwind bookkeeping for the matching pop: the CFA offset shrinks by 4
   and REG is marked as restored.  Emits directives only.  */
#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

/* pushl/popl of REG together with the matching unwind bookkeeping.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Build-mode dependent macros.  The position-independent build uses %ebx
   as a scratch/PIC register, so %ebx is saved on entry (which moves the
   arguments 4 bytes further from %esp) and jump tables hold offsets
   relative to the table base.  The non-PIC build needs neither.  */
#if (defined SHARED || defined __PIC__)
# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
/* Final return: restore %ebx and return.  */
# define RETURN_END	POP (%ebx); ret
/* Return from the middle of the function: same instructions as
   RETURN_END, followed by CFI_PUSH so the unwind state of the code placed
   after the return again describes %ebx as pushed.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
/* Jump table entry: offset of label I relative to table base B.  */
# define JMPTBL(I, B)	I - B

/* Call the PC thunk for register e<x>; per the comment inside
   BRANCH_TO_JMPTBL_ENTRY this loads the PC into that register.  */
# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
	jump table with relative offsets.  INDEX is a register that contains
	the index into the jump table.  SCALE is the scale of INDEX.
	Clobbers EBX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    /* We first load PC into EBX.  */		\
	SETUP_PIC_REG(bx);		\
    /* Get the address of the jump table.  */		\
	addl	$(TABLE - .), %ebx;		\
    /* Get the entry and convert the relative offset to the		\
	absolute	address.  */		\
	addl	(%ebx, INDEX, SCALE), %ebx;		\
    /* We loaded the jump table.  Go.  */		\
	jmp	*%ebx
#else

/* Non-PIC build: only the return address sits below the arguments,
   nothing is saved on entry and table entries are absolute addresses.  */
# define PARMS		4
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
	absolute offsets.  INDEX is a register that contains the index into
	the jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
	jmp	*TABLE(, INDEX, SCALE)
#endif

	.section .text.ssse3,"ax",@progbits
/* Function entry.  Loads the three stack arguments and dispatches on the
   byte count (and, when built as memmove, on how the buffers overlap).
   Register roles from here on:
     %edx = destination pointer
     %eax = source pointer
     %ecx = byte count
   In the PIC build %ebx is saved by ENTRANCE and later clobbered by
   SETUP_PIC_REG / BRANCH_TO_JMPTBL_ENTRY.  */
ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

#ifdef USE_AS_MEMMOVE
	/* dest below src: copy forward.  dest == src: branch to
	   L(fwd_write_0bytes) (outside this view).  */
	cmp	%eax, %edx
	jb	L(copy_forward)
	je	L(fwd_write_0bytes)
	/* dest above src.  Counts below 32 go to
	   L(bk_write_less32bytes_2) (outside this view).  */
	cmp	$32, %ecx
	jae	L(memmove_bwd)
	jmp	L(bk_write_less32bytes_2)

	.p2align 4
L(memmove_bwd):
	/* Form src + len only for the comparison, then reload the source
	   pointer (movl leaves the flags untouched).  If dest < src + len
	   the regions overlap and L(copy_backward) (outside this view) is
	   used; otherwise fall through to the forward copy.  */
	add	%ecx, %eax
	cmp	%eax, %edx
	movl	SRC(%esp), %eax
	jb	L(copy_backward)

L(copy_forward):
#endif
	/* 48 bytes or more take the SSE path.  */
	cmp	$48, %ecx
	jae	L(48bytesormore)

	/* Fewer than 48 bytes remain here (the label name still says 32).  */
L(fwd_write_less32bytes):
#ifndef USE_AS_MEMMOVE
	/* NOTE(review): only the low bytes of the two addresses are
	   compared; this looks like a heuristic that selects the backward
	   table -- confirm the intent.  */
	cmp	%dl, %al
	jb	L(bk_write)
#endif
	/* Point both registers at the end of the buffers, then dispatch on
	   the count.  The table is outside this view; presumably its
	   entries address the data with negative offsets -- confirm.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
#ifndef USE_AS_MEMMOVE
	.p2align 4
L(bk_write):
	/* Same dispatch on the count, but with the pointers still at the
	   start of the buffers.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
#endif

	.p2align 4
/* Forward copy of 48 bytes or more.  Handles the first 16 bytes, rounds
   the destination up to the next 16-byte boundary and then picks a copy
   loop from the total size and the source misalignment.  Pushes %edi;
   every path taken from here pops it again.  */
L(48bytesormore):
#ifndef USE_AS_MEMMOVE
	/* memcpy: copy the first 16 bytes right away as two 8-byte moves.  */
	movlpd	(%eax), %xmm0
	movlpd	8(%eax), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)
#else
	/* memmove: only load the first 16 bytes; they are stored later at
	   L(shl_0) / L(shl_N) from %xmm0.  */
	movdqu	(%eax), %xmm0
#endif
	PUSH (%edi)
	/* %edx = first 16-byte boundary strictly above the destination.
	   %edi = old dest - new dest (-16..-1).  Adding it to %ecx and
	   subtracting it from %eax shortens the count and advances the
	   source by the number of bytes skipped.  */
	movl	%edx, %edi
	and	$-16, %edx
	add	$16, %edx
	sub	%edx, %edi
	add	%edi, %ecx
	sub	%edi, %eax

	/* Compare the remaining count with half the shared cache size (a
	   build-time constant or a variable, depending on configuration).  */
#ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_shared_cache_size_half, %ecx
# endif
#endif

	/* mov does not change the flags, so jae still tests the cmp above.
	   At or above the threshold go to L(large_page) (outside this view).  */
	mov	%eax, %edi
	jae	L(large_page)
	/* %edi = source misalignment within 16 bytes.  Zero means both
	   pointers are aligned; otherwise dispatch through L(shl_table)
	   (outside this view) indexed by the misalignment.  */
	and	$0xf, %edi
	jz	L(shl_0)
	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)

	.p2align 4
/* Source and destination are both 16-byte aligned.  Counts above 127 go
   to L(shl_0_gobble); smaller counts are copied 32 bytes at a time with
   %edi as a running index into both buffers.  On entry %edi is still
   pushed.  */
L(shl_0):
#ifdef USE_AS_MEMMOVE
	/* Store the 16 head bytes kept in %xmm0 to the original destination
	   (DEST+4 because %edi is on the stack).  */
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	xor	%edi, %edi
	cmp	$127, %ecx
	ja	L(shl_0_gobble)
	/* Bias the count by 32 so that a borrow in the loop signals that
	   fewer than 32 bytes remain after the current step.  */
	lea	-32(%ecx), %ecx

	.p2align 4
	/* Four unrolled 32-byte steps.  sub sets CF; movdqa and lea leave
	   the flags alone, so each jb tests the preceding sub.  The last
	   step falls through into L(shl_0_end).  */
L(shl_0_loop):
	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi

L(shl_0_end):
	/* Undo the -32 bias, fold index plus remaining count into both
	   pointers so they point at the end of the buffers, restore %edi
	   and finish through the tail table indexed by the remaining count
	   (table outside this view).  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	add	%edi, %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)

	/* Unwind info only: L(shl_0_gobble) below is entered with %edi
	   still pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Aligned copy of more than 127 bytes.  Chooses between the plain loop
   below and L(shl_0_gobble_mem_loop) by comparing the count with half the
   data cache size, then copies 128 bytes per iteration through
   %xmm0-%xmm7.  On entry %edi is still pushed; it is popped here.  */
L(shl_0_gobble):
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	/* popl and lea do not change the flags, so jae still tests the cmp
	   above.  The count is biased by one 128-byte iteration.  */
	POP	(%edi)
	lea	-128(%ecx), %ecx
	jae	L(shl_0_gobble_mem_loop)

	.p2align 4
L(shl_0_gobble_cache_loop):
	movdqa	(%eax), %xmm0
	movdqa	0x10(%eax), %xmm1
	movdqa	0x20(%eax), %xmm2
	movdqa	0x30(%eax), %xmm3
	movdqa	0x40(%eax), %xmm4
	movdqa	0x50(%eax), %xmm5
	movdqa	0x60(%eax), %xmm6
	movdqa	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	%xmm2, 0x20(%edx)
	movdqa	%xmm3, 0x30(%edx)
	movdqa	%xmm4, 0x40(%edx)
	movdqa	%xmm5, 0x50(%edx)
	movdqa	%xmm6, 0x60(%edx)
	movdqa	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	/* Loop while the sub above did not borrow (the stores and lea keep
	   the flags).  */
	jae	L(shl_0_gobble_cache_loop)
	/* %ecx is now remaining - 128.  Compare it with -64 before lea
	   removes the bias, so jl is taken when fewer than 64 bytes are
	   left.  */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(shl_0_cache_less_64bytes)

	/* Copy one 64-byte chunk.  */
	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx

L(shl_0_cache_less_64bytes):
	/* Copy one 32-byte chunk if at least 32 bytes remain.  */
	cmp	$0x20, %ecx
	jb	L(shl_0_cache_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx

L(shl_0_cache_less_32bytes):
	/* Copy one 16-byte chunk if at least 16 bytes remain.  */
	cmp	$0x10, %ecx
	jb	L(shl_0_cache_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx

L(shl_0_cache_less_16bytes):
	/* Fewer than 16 bytes remain: point both registers at the end of
	   the buffers and finish through the tail table indexed by the
	   remaining count (table outside this view).  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

	.p2align 4
/* Aligned 128-byte-per-iteration copy used when the count is at least
   half the data cache size.  Same structure as L(shl_0_gobble_cache_loop)
   plus prefetches ahead of both the source and the destination.  Entered
   with %ecx already biased by -128 and %edi already restored.  */
L(shl_0_gobble_mem_loop):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x280(%eax)
	prefetcht0 0x1c0(%edx)

	movdqa	(%eax), %xmm0
	movdqa	0x10(%eax), %xmm1
	movdqa	0x20(%eax), %xmm2
	movdqa	0x30(%eax), %xmm3
	movdqa	0x40(%eax), %xmm4
	movdqa	0x50(%eax), %xmm5
	movdqa	0x60(%eax), %xmm6
	movdqa	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$0x80, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	%xmm2, 0x20(%edx)
	movdqa	%xmm3, 0x30(%edx)
	movdqa	%xmm4, 0x40(%edx)
	movdqa	%xmm5, 0x50(%edx)
	movdqa	%xmm6, 0x60(%edx)
	movdqa	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	/* Loop while the sub above did not borrow (the stores and lea keep
	   the flags).  */
	jae	L(shl_0_gobble_mem_loop)
	/* %ecx is now remaining - 128.  Compare it with -64 before lea
	   removes the bias, so jl is taken when fewer than 64 bytes are
	   left.  */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(shl_0_mem_less_64bytes)

	/* Copy one 64-byte chunk.  */
	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1

	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)

	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax

	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx

L(shl_0_mem_less_64bytes):
	/* Copy one 32-byte chunk if at least 32 bytes remain.  */
	cmp	$0x20, %ecx
	jb	L(shl_0_mem_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx

L(shl_0_mem_less_32bytes):
	/* Copy one 16-byte chunk if at least 16 bytes remain.  */
	cmp	$0x10, %ecx
	jb	L(shl_0_mem_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx

L(shl_0_mem_less_16bytes):
	/* Fewer than 16 bytes remain: point both registers at the end of
	   the buffers and finish through the tail table indexed by the
	   remaining count (table outside this view).  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)

	.p2align 4
/* Forward copy for a source that sits 1 byte past a 16-byte boundary
   while %edx is 16-byte aligned (the movaps from -1(%eax) below needs
   %eax - 1 aligned).  Presumably entered through entry 1 of L(shl_table),
   which is outside this view -- confirm.  Aligned source blocks are
   merged pairwise with palignr $1 so every store is an aligned 16 bytes.
   On entry %edi is still pushed and %ecx holds the remaining count.  */
L(shl_1):
	/* %xmm1 = aligned block holding the first source bytes.  For
	   memmove also store the head bytes kept in %xmm0 to the original
	   destination (DEST+4 because of the pushed %edi).  */
#ifndef USE_AS_MEMMOVE
	movaps	-1(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-1(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below half the data cache size take the loop without
	   prefetch.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_1_no_prefetch)

	/* Bias the count by one 64-byte iteration.  */
	lea	-64(%ecx), %ecx

	.p2align 4
	/* 64 bytes per iteration.  %xmm7 keeps the last loaded block, which
	   becomes the carry-in %xmm1 of the next iteration.  */
L(Shl1LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	15(%eax), %xmm2
	movaps	31(%eax), %xmm3
	movaps	47(%eax), %xmm4
	movaps	63(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$1, %xmm4, %xmm5
	palignr	$1, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$1, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$1, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl1LoopStart)

L(Shl1LoopLeave):
	/* %ecx + 64 bytes remain.  With 32 or fewer left go to
	   L(shl_end_0) (outside this view); otherwise copy 32 more bytes
	   here.  */
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	15(%eax), %xmm2
	movaps	31(%eax), %xmm3
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Advance both pointers to the end of the buffers; %ecx = bytes
	   still uncopied, used as the jump-table index.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_1_no_prefetch):
	/* Bias the count by 32, round %eax down to the aligned block and
	   index both buffers with %edi.  */
	lea	-32(%ecx), %ecx
	lea	-1(%eax), %eax
	xor	%edi, %edi

	.p2align 4
	/* 32 bytes per step, unrolled twice; the carry block alternates
	   between %xmm1 and %xmm4.  sub sets CF; the moves, palignr and lea
	   leave the flags alone, so jb/jae test the sub.  */
L(sh_1_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_1_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_1_no_prefetch_loop)

L(sh_1_end_no_prefetch_loop):
	/* Undo the bias, fold the index into the pointers (adding back the
	   1 byte removed from %eax) and finish via the tail table.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	1(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 2 bytes past a 16-byte boundary
   while %edx is 16-byte aligned (the movaps from -2(%eax) below needs
   %eax - 2 aligned).  Presumably entered through entry 2 of L(shl_table),
   which is outside this view -- confirm.  Aligned source blocks are
   merged pairwise with palignr $2 so every store is an aligned 16 bytes.
   On entry %edi is still pushed and %ecx holds the remaining count.  */
L(shl_2):
	/* %xmm1 = aligned block holding the first source bytes.  For
	   memmove also store the head bytes kept in %xmm0 to the original
	   destination (DEST+4 because of the pushed %edi).  */
#ifndef USE_AS_MEMMOVE
	movaps	-2(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-2(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below half the data cache size take the loop without
	   prefetch.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_2_no_prefetch)

	/* Bias the count by one 64-byte iteration.  */
	lea	-64(%ecx), %ecx

	.p2align 4
	/* 64 bytes per iteration.  %xmm7 keeps the last loaded block, which
	   becomes the carry-in %xmm1 of the next iteration.  */
L(Shl2LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	14(%eax), %xmm2
	movaps	30(%eax), %xmm3
	movaps	46(%eax), %xmm4
	movaps	62(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$2, %xmm4, %xmm5
	palignr	$2, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$2, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$2, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl2LoopStart)

L(Shl2LoopLeave):
	/* %ecx + 64 bytes remain.  With 32 or fewer left go to
	   L(shl_end_0) (outside this view); otherwise copy 32 more bytes
	   here.  */
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	14(%eax), %xmm2
	movaps	30(%eax), %xmm3
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Advance both pointers to the end of the buffers; %ecx = bytes
	   still uncopied, used as the jump-table index.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_2_no_prefetch):
	/* Bias the count by 32, round %eax down to the aligned block and
	   index both buffers with %edi.  */
	lea	-32(%ecx), %ecx
	lea	-2(%eax), %eax
	xor	%edi, %edi

	.p2align 4
	/* 32 bytes per step, unrolled twice; the carry block alternates
	   between %xmm1 and %xmm4.  sub sets CF; the moves, palignr and lea
	   leave the flags alone, so jb/jae test the sub.  */
L(sh_2_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_2_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_2_no_prefetch_loop)

L(sh_2_end_no_prefetch_loop):
	/* Undo the bias, fold the index into the pointers (adding back the
	   2 bytes removed from %eax) and finish via the tail table.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	2(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 3 bytes past a 16-byte boundary
   while %edx is 16-byte aligned (the movaps from -3(%eax) below needs
   %eax - 3 aligned).  Presumably entered through entry 3 of L(shl_table),
   which is outside this view -- confirm.  Aligned source blocks are
   merged pairwise with palignr $3 so every store is an aligned 16 bytes.
   On entry %edi is still pushed and %ecx holds the remaining count.  */
L(shl_3):
	/* %xmm1 = aligned block holding the first source bytes.  For
	   memmove also store the head bytes kept in %xmm0 to the original
	   destination (DEST+4 because of the pushed %edi).  */
#ifndef USE_AS_MEMMOVE
	movaps	-3(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-3(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below half the data cache size take the loop without
	   prefetch.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_3_no_prefetch)

	/* Bias the count by one 64-byte iteration.  */
	lea	-64(%ecx), %ecx

	.p2align 4
	/* 64 bytes per iteration.  %xmm7 keeps the last loaded block, which
	   becomes the carry-in %xmm1 of the next iteration.  */
L(Shl3LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	13(%eax), %xmm2
	movaps	29(%eax), %xmm3
	movaps	45(%eax), %xmm4
	movaps	61(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$3, %xmm4, %xmm5
	palignr	$3, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$3, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$3, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl3LoopStart)

L(Shl3LoopLeave):
	/* %ecx + 64 bytes remain.  With 32 or fewer left go to
	   L(shl_end_0) (outside this view); otherwise copy 32 more bytes
	   here.  */
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	13(%eax), %xmm2
	movaps	29(%eax), %xmm3
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Advance both pointers to the end of the buffers; %ecx = bytes
	   still uncopied, used as the jump-table index.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_3_no_prefetch):
	/* Bias the count by 32, round %eax down to the aligned block and
	   index both buffers with %edi.  */
	lea	-32(%ecx), %ecx
	lea	-3(%eax), %eax
	xor	%edi, %edi

	.p2align 4
	/* 32 bytes per step, unrolled twice; the carry block alternates
	   between %xmm1 and %xmm4.  sub sets CF; the moves, palignr and lea
	   leave the flags alone, so jb/jae test the sub.  */
L(sh_3_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(sh_3_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(sh_3_no_prefetch_loop)

L(sh_3_end_no_prefetch_loop):
	/* Undo the bias, fold the index into the pointers (adding back the
	   3 bytes removed from %eax) and finish via the tail table.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	3(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 4 bytes past a 16-byte boundary
   while %edx is 16-byte aligned (the movaps from -4(%eax) below needs
   %eax - 4 aligned).  Presumably entered through entry 4 of L(shl_table),
   which is outside this view -- confirm.  Aligned source blocks are
   merged pairwise with palignr $4 so every store is an aligned 16 bytes.
   On entry %edi is still pushed and %ecx holds the remaining count.  */
L(shl_4):
	/* %xmm1 = aligned block holding the first source bytes.  For
	   memmove also store the head bytes kept in %xmm0 to the original
	   destination (DEST+4 because of the pushed %edi).  */
#ifndef USE_AS_MEMMOVE
	movaps	-4(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-4(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below half the data cache size take the loop without
	   prefetch.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_4_no_prefetch)

	/* Bias the count by one 64-byte iteration.  */
	lea	-64(%ecx), %ecx

	.p2align 4
	/* 64 bytes per iteration.  %xmm7 keeps the last loaded block, which
	   becomes the carry-in %xmm1 of the next iteration.  */
L(Shl4LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	12(%eax), %xmm2
	movaps	28(%eax), %xmm3
	movaps	44(%eax), %xmm4
	movaps	60(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$4, %xmm4, %xmm5
	palignr	$4, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$4, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl4LoopStart)

L(Shl4LoopLeave):
	/* %ecx + 64 bytes remain.  With 32 or fewer left go to
	   L(shl_end_0) (outside this view); otherwise copy 32 more bytes
	   here.  */
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	12(%eax), %xmm2
	movaps	28(%eax), %xmm3
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Advance both pointers to the end of the buffers; %ecx = bytes
	   still uncopied, used as the jump-table index.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_4_no_prefetch):
	/* Bias the count by 32, round %eax down to the aligned block and
	   index both buffers with %edi.  */
	lea	-32(%ecx), %ecx
	lea	-4(%eax), %eax
	xor	%edi, %edi

	.p2align 4
	/* 32 bytes per step, unrolled twice; the carry block alternates
	   between %xmm1 and %xmm4.  sub sets CF; the moves, palignr and lea
	   leave the flags alone, so jb/jae test the sub.  */
L(sh_4_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(sh_4_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(sh_4_no_prefetch_loop)

L(sh_4_end_no_prefetch_loop):
	/* Undo the bias, fold the index into the pointers (adding back the
	   4 bytes removed from %eax) and finish via the tail table.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	4(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 5 bytes past a 16-byte boundary
   while %edx is 16-byte aligned (the movaps from -5(%eax) below needs
   %eax - 5 aligned).  Presumably entered through entry 5 of L(shl_table),
   which is outside this view -- confirm.  Aligned source blocks are
   merged pairwise with palignr $5 so every store is an aligned 16 bytes.
   On entry %edi is still pushed and %ecx holds the remaining count.  */
L(shl_5):
	/* %xmm1 = aligned block holding the first source bytes.  For
	   memmove also store the head bytes kept in %xmm0 to the original
	   destination (DEST+4 because of the pushed %edi).  */
#ifndef USE_AS_MEMMOVE
	movaps	-5(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-5(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below half the data cache size take the loop without
	   prefetch.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_5_no_prefetch)

	/* Bias the count by one 64-byte iteration.  */
	lea	-64(%ecx), %ecx

	.p2align 4
	/* 64 bytes per iteration.  %xmm7 keeps the last loaded block, which
	   becomes the carry-in %xmm1 of the next iteration.  */
L(Shl5LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	11(%eax), %xmm2
	movaps	27(%eax), %xmm3
	movaps	43(%eax), %xmm4
	movaps	59(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$5, %xmm4, %xmm5
	palignr	$5, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$5, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$5, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl5LoopStart)

L(Shl5LoopLeave):
	/* %ecx + 64 bytes remain.  With 32 or fewer left go to
	   L(shl_end_0) (outside this view); otherwise copy 32 more bytes
	   here.  */
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	11(%eax), %xmm2
	movaps	27(%eax), %xmm3
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Advance both pointers to the end of the buffers; %ecx = bytes
	   still uncopied, used as the jump-table index.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_5_no_prefetch):
	/* Bias the count by 32, round %eax down to the aligned block and
	   index both buffers with %edi.  */
	lea	-32(%ecx), %ecx
	lea	-5(%eax), %eax
	xor	%edi, %edi

	.p2align 4
	/* 32 bytes per step, unrolled twice; the carry block alternates
	   between %xmm1 and %xmm4.  sub sets CF; the moves, palignr and lea
	   leave the flags alone, so jb/jae test the sub.  */
L(sh_5_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(sh_5_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(sh_5_no_prefetch_loop)

L(sh_5_end_no_prefetch_loop):
	/* Undo the bias, fold the index into the pointers (adding back the
	   5 bytes removed from %eax) and finish via the tail table.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	5(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 6 bytes past a 16-byte boundary
   while %edx is 16-byte aligned (the movaps from -6(%eax) below needs
   %eax - 6 aligned).  Presumably entered through entry 6 of L(shl_table),
   which is outside this view -- confirm.  Aligned source blocks are
   merged pairwise with palignr $6 so every store is an aligned 16 bytes.
   On entry %edi is still pushed and %ecx holds the remaining count.  */
L(shl_6):
	/* %xmm1 = aligned block holding the first source bytes.  For
	   memmove also store the head bytes kept in %xmm0 to the original
	   destination (DEST+4 because of the pushed %edi).  */
#ifndef USE_AS_MEMMOVE
	movaps	-6(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-6(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below half the data cache size take the loop without
	   prefetch.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_6_no_prefetch)

	/* Bias the count by one 64-byte iteration.  */
	lea	-64(%ecx), %ecx

	.p2align 4
	/* 64 bytes per iteration.  %xmm7 keeps the last loaded block, which
	   becomes the carry-in %xmm1 of the next iteration.  */
L(Shl6LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	10(%eax), %xmm2
	movaps	26(%eax), %xmm3
	movaps	42(%eax), %xmm4
	movaps	58(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$6, %xmm4, %xmm5
	palignr	$6, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$6, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$6, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl6LoopStart)

L(Shl6LoopLeave):
	/* %ecx + 64 bytes remain.  With 32 or fewer left go to
	   L(shl_end_0) (outside this view); otherwise copy 32 more bytes
	   here.  */
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	10(%eax), %xmm2
	movaps	26(%eax), %xmm3
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Advance both pointers to the end of the buffers; %ecx = bytes
	   still uncopied, used as the jump-table index.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_6_no_prefetch):
	/* Bias the count by 32, round %eax down to the aligned block and
	   index both buffers with %edi.  */
	lea	-32(%ecx), %ecx
	lea	-6(%eax), %eax
	xor	%edi, %edi

	.p2align 4
	/* 32 bytes per step, unrolled twice; the carry block alternates
	   between %xmm1 and %xmm4.  sub sets CF; the moves, palignr and lea
	   leave the flags alone, so jb/jae test the sub.  */
L(sh_6_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(sh_6_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(sh_6_no_prefetch_loop)

L(sh_6_end_no_prefetch_loop):
	/* Undo the bias, fold the index into the pointers (adding back the
	   6 bytes removed from %eax) and finish via the tail table.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	6(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 7 bytes past a 16-byte boundary
   while %edx is 16-byte aligned (the movaps from -7(%eax) below needs
   %eax - 7 aligned).  Presumably entered through entry 7 of L(shl_table),
   which is outside this view -- confirm.  Aligned source blocks are
   merged pairwise with palignr $7 so every store is an aligned 16 bytes.
   On entry %edi is still pushed and %ecx holds the remaining count.  */
L(shl_7):
	/* %xmm1 = aligned block holding the first source bytes.  For
	   memmove also store the head bytes kept in %xmm0 to the original
	   destination (DEST+4 because of the pushed %edi).  */
#ifndef USE_AS_MEMMOVE
	movaps	-7(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-7(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below half the data cache size take the loop without
	   prefetch.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_7_no_prefetch)

	/* Bias the count by one 64-byte iteration.  */
	lea	-64(%ecx), %ecx

	.p2align 4
	/* 64 bytes per iteration.  %xmm7 keeps the last loaded block, which
	   becomes the carry-in %xmm1 of the next iteration.  */
L(Shl7LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	movaps	41(%eax), %xmm4
	movaps	57(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$7, %xmm4, %xmm5
	palignr	$7, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$7, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl7LoopStart)

L(Shl7LoopLeave):
	/* %ecx + 64 bytes remain.  With 32 or fewer left go to
	   L(shl_end_0) (outside this view); otherwise copy 32 more bytes
	   here.  */
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Advance both pointers to the end of the buffers; %ecx = bytes
	   still uncopied, used as the jump-table index.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_7_no_prefetch):
	/* Bias the count by 32, round %eax down to the aligned block and
	   index both buffers with %edi.  */
	lea	-32(%ecx), %ecx
	lea	-7(%eax), %eax
	xor	%edi, %edi

	.p2align 4
	/* 32 bytes per step, unrolled twice; the carry block alternates
	   between %xmm1 and %xmm4.  sub sets CF; the moves, palignr and lea
	   leave the flags alone, so jb/jae test the sub.  */
L(sh_7_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_7_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_7_no_prefetch_loop)

L(sh_7_end_no_prefetch_loop):
	/* Undo the bias, fold the index into the pointers (adding back the
	   7 bytes removed from %eax) and finish via the tail table.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	7(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 8 bytes past a 16-byte boundary
   while %edx is 16-byte aligned (the movaps from -8(%eax) below needs
   %eax - 8 aligned).  Presumably entered through entry 8 of L(shl_table),
   which is outside this view -- confirm.  Aligned source blocks are
   merged pairwise with palignr $8 so every store is an aligned 16 bytes.
   On entry %edi is still pushed and %ecx holds the remaining count.  */
L(shl_8):
	/* %xmm1 = aligned block holding the first source bytes.  For
	   memmove also store the head bytes kept in %xmm0 to the original
	   destination (DEST+4 because of the pushed %edi).  */
#ifndef USE_AS_MEMMOVE
	movaps	-8(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-8(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below half the data cache size take the loop without
	   prefetch.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_8_no_prefetch)

	/* Bias the count by one 64-byte iteration.  */
	lea	-64(%ecx), %ecx

	.p2align 4
	/* 64 bytes per iteration.  %xmm7 keeps the last loaded block, which
	   becomes the carry-in %xmm1 of the next iteration.  */
L(Shl8LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	8(%eax), %xmm2
	movaps	24(%eax), %xmm3
	movaps	40(%eax), %xmm4
	movaps	56(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$8, %xmm4, %xmm5
	palignr	$8, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$8, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl8LoopStart)

	/* Note: this label is spelled LoopLeave8, unlike the ShlNLoopLeave
	   labels of the neighbouring sections.  */
L(LoopLeave8):
	/* %ecx + 64 bytes remain.  With 32 or fewer left go to
	   L(shl_end_0) (outside this view); otherwise copy 32 more bytes
	   here.  */
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	8(%eax), %xmm2
	movaps	24(%eax), %xmm3
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Advance both pointers to the end of the buffers; %ecx = bytes
	   still uncopied, used as the jump-table index.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_8_no_prefetch):
	/* Bias the count by 32, round %eax down to the aligned block and
	   index both buffers with %edi.  */
	lea	-32(%ecx), %ecx
	lea	-8(%eax), %eax
	xor	%edi, %edi

	.p2align 4
	/* 32 bytes per step, unrolled twice; the carry block alternates
	   between %xmm1 and %xmm4.  sub sets CF; the moves, palignr and lea
	   leave the flags alone, so jb/jae test the sub.  */
L(sh_8_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_8_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_8_no_prefetch_loop)

L(sh_8_end_no_prefetch_loop):
	/* Undo the bias, fold the index into the pointers (adding back the
	   8 bytes removed from %eax) and finish via the tail table.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	8(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 9 bytes past a 16-byte boundary
   while %edx is 16-byte aligned (the movaps from -9(%eax) below needs
   %eax - 9 aligned).  Presumably entered through entry 9 of L(shl_table),
   which is outside this view -- confirm.  Aligned source blocks are
   merged pairwise with palignr $9 so every store is an aligned 16 bytes.
   On entry %edi is still pushed and %ecx holds the remaining count.  */
L(shl_9):
	/* %xmm1 = aligned block holding the first source bytes.  For
	   memmove also store the head bytes kept in %xmm0 to the original
	   destination (DEST+4 because of the pushed %edi).  */
#ifndef USE_AS_MEMMOVE
	movaps	-9(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-9(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below half the data cache size take the loop without
	   prefetch.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_9_no_prefetch)

	/* Bias the count by one 64-byte iteration.  */
	lea	-64(%ecx), %ecx

	.p2align 4
	/* 64 bytes per iteration.  %xmm7 keeps the last loaded block, which
	   becomes the carry-in %xmm1 of the next iteration.  */
L(Shl9LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	movaps	39(%eax), %xmm4
	movaps	55(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$9, %xmm4, %xmm5
	palignr	$9, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$9, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$9, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl9LoopStart)

L(Shl9LoopLeave):
	/* %ecx + 64 bytes remain.  With 32 or fewer left go to
	   L(shl_end_0) (outside this view); otherwise copy 32 more bytes
	   here.  */
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Advance both pointers to the end of the buffers; %ecx = bytes
	   still uncopied, used as the jump-table index.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_9_no_prefetch):
	/* Bias the count by 32, round %eax down to the aligned block and
	   index both buffers with %edi.  */
	lea	-32(%ecx), %ecx
	lea	-9(%eax), %eax
	xor	%edi, %edi

	.p2align 4
	/* 32 bytes per step, unrolled twice; the carry block alternates
	   between %xmm1 and %xmm4.  sub sets CF; the moves, palignr and lea
	   leave the flags alone, so jb/jae test the sub.  */
L(sh_9_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_9_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_9_no_prefetch_loop)

L(sh_9_end_no_prefetch_loop):
	/* Undo the bias, fold the index into the pointers (adding back the
	   9 bytes removed from %eax) and finish via the tail table.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	9(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 10 bytes past a 16-byte boundary
   while %edx is 16-byte aligned (the movaps from -10(%eax) below needs
   %eax - 10 aligned).  Presumably entered through entry 10 of
   L(shl_table), which is outside this view -- confirm.  Aligned source
   blocks are merged pairwise with palignr $10 so every store is an
   aligned 16 bytes.  On entry %edi is still pushed and %ecx holds the
   remaining count.  */
L(shl_10):
	/* %xmm1 = aligned block holding the first source bytes.  For
	   memmove also store the head bytes kept in %xmm0 to the original
	   destination (DEST+4 because of the pushed %edi).  */
#ifndef USE_AS_MEMMOVE
	movaps	-10(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-10(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below half the data cache size take the loop without
	   prefetch.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_10_no_prefetch)

	/* Bias the count by one 64-byte iteration.  */
	lea	-64(%ecx), %ecx

	.p2align 4
	/* 64 bytes per iteration.  %xmm7 keeps the last loaded block, which
	   becomes the carry-in %xmm1 of the next iteration.  */
L(Shl10LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	6(%eax), %xmm2
	movaps	22(%eax), %xmm3
	movaps	38(%eax), %xmm4
	movaps	54(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$10, %xmm4, %xmm5
	palignr	$10, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$10, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$10, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl10LoopStart)

L(Shl10LoopLeave):
	/* %ecx + 64 bytes remain.  With 32 or fewer left go to
	   L(shl_end_0) (outside this view); otherwise copy 32 more bytes
	   here.  */
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	6(%eax), %xmm2
	movaps	22(%eax), %xmm3
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Advance both pointers to the end of the buffers; %ecx = bytes
	   still uncopied, used as the jump-table index.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_10_no_prefetch):
	/* Bias the count by 32, round %eax down to the aligned block and
	   index both buffers with %edi.  */
	lea	-32(%ecx), %ecx
	lea	-10(%eax), %eax
	xor	%edi, %edi

	.p2align 4
	/* 32 bytes per step, unrolled twice; the carry block alternates
	   between %xmm1 and %xmm4.  sub sets CF; the moves, palignr and lea
	   leave the flags alone, so jb/jae test the sub.  */
L(sh_10_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_10_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_10_no_prefetch_loop)

L(sh_10_end_no_prefetch_loop):
	/* Undo the bias, fold the index into the pointers (adding back the
	   10 bytes removed from %eax) and finish via the tail table.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	10(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* Unwind info only: the code that follows runs with %edi still
	   pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(shl_11):
#ifndef USE_AS_MEMMOVE
	movaps	-11(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-11(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_11_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
L(Shl11LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	5(%eax), %xmm2
	movaps	21(%eax), %xmm3
	movaps	37(%eax), %xmm4
	movaps	53(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$11, %xmm4, %xmm5
	palignr	$11, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$11, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$11, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl11LoopStart)

L(Shl11LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	5(%eax), %xmm2
	movaps	21(%eax), %xmm3
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* Misalignment-11 copy without prefetching, 32 bytes per step.  %eax
   is moved back by 11 so that the movdqa loads are aligned, and %edi
   becomes the running byte offset applied to both %eax and %edx.
   %xmm1 was primed by L(shl_11) before the jump here.  */
L(sh_11_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias the count for the sub/jb and sub/jae tests  */
	lea	-11(%eax), %eax
	xor	%edi, %edi

	.p2align 4
/* Unrolled twice so the carried chunk alternates between %xmm1 and
   %xmm4 without an extra register move.  The conditional jumps use
   the carry produced by the sub of 32 from %ecx; the lea, movdqa and
   palignr in between leave the flags untouched.  */
L(sh_11_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* keep the newest chunk unshifted for the second half  */
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_11_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* keep the newest chunk unshifted for the first half  */
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_11_no_prefetch_loop)

L(sh_11_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* undo the bias: %ecx = bytes still to copy  */
	add	%ecx, %edi	/* %edi = bytes copied so far + bytes pending  */
	add	%edi, %edx
	lea	11(%edi, %eax), %eax	/* also restores the 11 bytes taken off %eax above  */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 12 bytes past a 16-byte boundary
   (entry 12 of L(shl_table)).  All loads and stores below are aligned
   movaps; palignr $12 splices two neighbouring source chunks into the
   16 bytes that belong at the destination.  %ecx is the byte count,
   %eax the source and %edx the destination.  CFI_PUSH and POP are
   macros defined outside this chunk; presumably they keep the unwind
   info for the saved %edi in step.  */
L(shl_12):
#ifndef USE_AS_MEMMOVE
	movaps	-12(%eax), %xmm1	/* aligned chunk that contains the byte at %eax  */
#else
	/* NOTE(review): %xmm0 is loaded before this chunk; presumably it
	   holds the head of the source and is stored this late so that an
	   overlapping memmove cannot clobber it -- confirm.  */
	movl	DEST+4(%esp), %edi
	movaps	-12(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Lengths below half the data cache size take the loop without
	   prefetch instructions.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_12_no_prefetch)

	lea	-64(%ecx), %ecx	/* bias the count for the sub/ja test below  */

	.p2align 4
/* 64 bytes per iteration.  %xmm1 enters as the previous aligned chunk;
   %xmm7 keeps an unshifted copy of the newest chunk for the next pass.  */
L(Shl12LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	4(%eax), %xmm2
	movaps	20(%eax), %xmm3
	movaps	36(%eax), %xmm4
	movaps	52(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$12, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl12LoopStart)	/* loop while more than 64 bytes are still pending  */

L(Shl12LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)	/* at most 32 bytes left: common tail code  */

	movaps	4(%eax), %xmm2
	movaps	20(%eax), %xmm3
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Step both pointers over the 32 bytes just copied plus the %ecx
	   bytes still pending; the tail handlers use negative offsets.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* Misalignment-12 copy without prefetching, 32 bytes per step.  %eax
   is moved back by 12 so that the movdqa loads are aligned, and %edi
   becomes the running byte offset applied to both %eax and %edx.
   %xmm1 was primed by L(shl_12) before the jump here.  */
L(sh_12_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias the count for the sub/jb and sub/jae tests  */
	lea	-12(%eax), %eax
	xor	%edi, %edi

	.p2align 4
/* Unrolled twice so the carried chunk alternates between %xmm1 and
   %xmm4 without an extra register move.  The conditional jumps use
   the carry produced by the sub of 32 from %ecx; the lea, movdqa and
   palignr in between leave the flags untouched.  */
L(sh_12_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* keep the newest chunk unshifted for the second half  */
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_12_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* keep the newest chunk unshifted for the first half  */
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_12_no_prefetch_loop)

L(sh_12_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* undo the bias: %ecx = bytes still to copy  */
	add	%ecx, %edi	/* %edi = bytes copied so far + bytes pending  */
	add	%edi, %edx
	lea	12(%edi, %eax), %eax	/* also restores the 12 bytes taken off %eax above  */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 13 bytes past a 16-byte boundary
   (entry 13 of L(shl_table)).  All loads and stores below are aligned
   movaps; palignr $13 splices two neighbouring source chunks into the
   16 bytes that belong at the destination.  %ecx is the byte count,
   %eax the source and %edx the destination.  CFI_PUSH and POP are
   macros defined outside this chunk; presumably they keep the unwind
   info for the saved %edi in step.  */
L(shl_13):
#ifndef USE_AS_MEMMOVE
	movaps	-13(%eax), %xmm1	/* aligned chunk that contains the byte at %eax  */
#else
	/* NOTE(review): %xmm0 is loaded before this chunk; presumably it
	   holds the head of the source and is stored this late so that an
	   overlapping memmove cannot clobber it -- confirm.  */
	movl	DEST+4(%esp), %edi
	movaps	-13(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Lengths below half the data cache size take the loop without
	   prefetch instructions.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_13_no_prefetch)

	lea	-64(%ecx), %ecx	/* bias the count for the sub/ja test below  */

	.p2align 4
/* 64 bytes per iteration.  %xmm1 enters as the previous aligned chunk;
   %xmm7 keeps an unshifted copy of the newest chunk for the next pass.  */
L(Shl13LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	3(%eax), %xmm2
	movaps	19(%eax), %xmm3
	movaps	35(%eax), %xmm4
	movaps	51(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$13, %xmm4, %xmm5
	palignr	$13, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$13, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$13, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl13LoopStart)	/* loop while more than 64 bytes are still pending  */

L(Shl13LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)	/* at most 32 bytes left: common tail code  */

	movaps	3(%eax), %xmm2
	movaps	19(%eax), %xmm3
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Step both pointers over the 32 bytes just copied plus the %ecx
	   bytes still pending; the tail handlers use negative offsets.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* Misalignment-13 copy without prefetching, 32 bytes per step.  %eax
   is moved back by 13 so that the movdqa loads are aligned, and %edi
   becomes the running byte offset applied to both %eax and %edx.
   %xmm1 was primed by L(shl_13) before the jump here.  */
L(sh_13_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias the count for the sub/jb and sub/jae tests  */
	lea	-13(%eax), %eax
	xor	%edi, %edi

	.p2align 4
/* Unrolled twice so the carried chunk alternates between %xmm1 and
   %xmm4 without an extra register move.  The conditional jumps use
   the carry produced by the sub of 32 from %ecx; the lea, movdqa and
   palignr in between leave the flags untouched.  */
L(sh_13_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* keep the newest chunk unshifted for the second half  */
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_13_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* keep the newest chunk unshifted for the first half  */
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_13_no_prefetch_loop)

L(sh_13_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* undo the bias: %ecx = bytes still to copy  */
	add	%ecx, %edi	/* %edi = bytes copied so far + bytes pending  */
	add	%edi, %edx
	lea	13(%edi, %eax), %eax	/* also restores the 13 bytes taken off %eax above  */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 14 bytes past a 16-byte boundary
   (entry 14 of L(shl_table)).  All loads and stores below are aligned
   movaps; palignr $14 splices two neighbouring source chunks into the
   16 bytes that belong at the destination.  %ecx is the byte count,
   %eax the source and %edx the destination.  CFI_PUSH and POP are
   macros defined outside this chunk; presumably they keep the unwind
   info for the saved %edi in step.  */
L(shl_14):
#ifndef USE_AS_MEMMOVE
	movaps	-14(%eax), %xmm1	/* aligned chunk that contains the byte at %eax  */
#else
	/* NOTE(review): %xmm0 is loaded before this chunk; presumably it
	   holds the head of the source and is stored this late so that an
	   overlapping memmove cannot clobber it -- confirm.  */
	movl	DEST+4(%esp), %edi
	movaps	-14(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Lengths below half the data cache size take the loop without
	   prefetch instructions.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_14_no_prefetch)

	lea	-64(%ecx), %ecx	/* bias the count for the sub/ja test below  */

	.p2align 4
/* 64 bytes per iteration.  %xmm1 enters as the previous aligned chunk;
   %xmm7 keeps an unshifted copy of the newest chunk for the next pass.  */
L(Shl14LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	2(%eax), %xmm2
	movaps	18(%eax), %xmm3
	movaps	34(%eax), %xmm4
	movaps	50(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$14, %xmm4, %xmm5
	palignr	$14, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$14, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$14, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl14LoopStart)	/* loop while more than 64 bytes are still pending  */

L(Shl14LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)	/* at most 32 bytes left: common tail code  */

	movaps	2(%eax), %xmm2
	movaps	18(%eax), %xmm3
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Step both pointers over the 32 bytes just copied plus the %ecx
	   bytes still pending; the tail handlers use negative offsets.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* Misalignment-14 copy without prefetching, 32 bytes per step.  %eax
   is moved back by 14 so that the movdqa loads are aligned, and %edi
   becomes the running byte offset applied to both %eax and %edx.
   %xmm1 was primed by L(shl_14) before the jump here.  */
L(sh_14_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias the count for the sub/jb and sub/jae tests  */
	lea	-14(%eax), %eax
	xor	%edi, %edi

	.p2align 4
/* Unrolled twice so the carried chunk alternates between %xmm1 and
   %xmm4 without an extra register move.  The conditional jumps use
   the carry produced by the sub of 32 from %ecx; the lea, movdqa and
   palignr in between leave the flags untouched.  */
L(sh_14_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* keep the newest chunk unshifted for the second half  */
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_14_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* keep the newest chunk unshifted for the first half  */
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_14_no_prefetch_loop)

L(sh_14_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* undo the bias: %ecx = bytes still to copy  */
	add	%ecx, %edi	/* %edi = bytes copied so far + bytes pending  */
	add	%edi, %edx
	lea	14(%edi, %eax), %eax	/* also restores the 14 bytes taken off %eax above  */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy for a source that sits 15 bytes past a 16-byte boundary
   (entry 15 of L(shl_table)).  All loads and stores below are aligned
   movaps; palignr $15 splices two neighbouring source chunks into the
   16 bytes that belong at the destination.  %ecx is the byte count,
   %eax the source and %edx the destination.  CFI_PUSH and POP are
   macros defined outside this chunk; presumably they keep the unwind
   info for the saved %edi in step.  */
L(shl_15):
#ifndef USE_AS_MEMMOVE
	movaps	-15(%eax), %xmm1	/* aligned chunk that contains the byte at %eax  */
#else
	/* NOTE(review): %xmm0 is loaded before this chunk; presumably it
	   holds the head of the source and is stored this late so that an
	   overlapping memmove cannot clobber it -- confirm.  */
	movl	DEST+4(%esp), %edi
	movaps	-15(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Lengths below half the data cache size take the loop without
	   prefetch instructions.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_15_no_prefetch)

	lea	-64(%ecx), %ecx	/* bias the count for the sub/ja test below  */

	.p2align 4
/* 64 bytes per iteration.  %xmm1 enters as the previous aligned chunk;
   %xmm7 keeps an unshifted copy of the newest chunk for the next pass.  */
L(Shl15LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	1(%eax), %xmm2
	movaps	17(%eax), %xmm3
	movaps	33(%eax), %xmm4
	movaps	49(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$15, %xmm4, %xmm5
	palignr	$15, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$15, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$15, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl15LoopStart)	/* loop while more than 64 bytes are still pending  */

L(Shl15LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)	/* at most 32 bytes left: common tail code  */

	movaps	1(%eax), %xmm2
	movaps	17(%eax), %xmm3
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Step both pointers over the 32 bytes just copied plus the %ecx
	   bytes still pending; the tail handlers use negative offsets.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* Misalignment-15 copy without prefetching, 32 bytes per step.  %eax
   is moved back by 15 so that the movdqa loads are aligned, and %edi
   becomes the running byte offset applied to both %eax and %edx.
   %xmm1 was primed by L(shl_15) before the jump here.  */
L(sh_15_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias the count for the sub/jb and sub/jae tests  */
	lea	-15(%eax), %eax
	xor	%edi, %edi

	.p2align 4
/* Unrolled twice so the carried chunk alternates between %xmm1 and
   %xmm4 without an extra register move.  The conditional jumps use
   the carry produced by the sub of 32 from %ecx; the lea, movdqa and
   palignr in between leave the flags untouched.  */
L(sh_15_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* keep the newest chunk unshifted for the second half  */
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_15_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* keep the newest chunk unshifted for the first half  */
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_15_no_prefetch_loop)

L(sh_15_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* undo the bias: %ecx = bytes still to copy  */
	add	%ecx, %edi	/* %edi = bytes copied so far + bytes pending  */
	add	%edi, %edx
	lea	15(%edi, %eax), %eax	/* also restores the 15 bytes taken off %eax above  */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* Common exit of the ShlNLoopLeave paths in this chunk, taken when at
   most 32 bytes remain after their add of 32 to %ecx.  A second add of
   32 removes the rest of the loop bias so that %ecx is the number of
   bytes still to copy; both pointers are then moved past those bytes
   because the tail handlers address the data with negative offsets.  */
L(shl_end_0):
	lea	32(%ecx), %ecx
	lea	(%edx, %ecx), %edx
	lea	(%eax, %ecx), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	.p2align 4
/* Forward tail copy for 44, 36, 28, 20, 12 and 4 bytes.  On entry %eax
   and %edx point just past the end of source and destination; each
   label moves 8 bytes and falls through to the next smaller size.  */
L(fwd_write_44bytes):
	movq	-44(%eax), %xmm0
	movq	%xmm0, -44(%edx)
L(fwd_write_36bytes):
	movq	-36(%eax), %xmm0
	movq	%xmm0, -36(%edx)
L(fwd_write_28bytes):
	movq	-28(%eax), %xmm0
	movq	%xmm0, -28(%edx)
L(fwd_write_20bytes):
	movq	-20(%eax), %xmm0
	movq	%xmm0, -20(%edx)
L(fwd_write_12bytes):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
	/* Return value: the mempcpy build returns %edx, every other build
	   reloads the DEST stack slot.  */
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Forward tail copy for 40, 32, 24, 16, 8 and 0 bytes.  On entry %eax
   and %edx point just past the end of source and destination; each
   label moves 8 bytes and falls through to the next smaller size.  */
L(fwd_write_40bytes):
	movq	-40(%eax), %xmm0
	movq	%xmm0, -40(%edx)
L(fwd_write_32bytes):
	movq	-32(%eax), %xmm0
	movq	%xmm0, -32(%edx)
L(fwd_write_24bytes):
	movq	-24(%eax), %xmm0
	movq	%xmm0, -24(%edx)
L(fwd_write_16bytes):
	movq	-16(%eax), %xmm0
	movq	%xmm0, -16(%edx)
L(fwd_write_8bytes):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
/* Nothing left to copy: only the return value is produced.  */
L(fwd_write_0bytes):
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Forward tail copy for exactly 5 bytes, done as two 4-byte moves that
   overlap by three bytes.  Both loads are issued before either store,
   and %eax is reused as scratch because it is overwritten with the
   return value right afterwards.  */
L(fwd_write_5bytes):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Forward tail copy for 45, 37, 29, 21 and 13 bytes.  On entry %eax
   and %edx point just past the end of source and destination; each
   label moves 8 bytes and falls through, and the last 5 bytes are
   finished with a 4-byte and a 1-byte move.  */
L(fwd_write_45bytes):
	movq	-45(%eax), %xmm0
	movq	%xmm0, -45(%edx)
L(fwd_write_37bytes):
	movq	-37(%eax), %xmm0
	movq	%xmm0, -37(%edx)
L(fwd_write_29bytes):
	movq	-29(%eax), %xmm0
	movq	%xmm0, -29(%edx)
L(fwd_write_21bytes):
	movq	-21(%eax), %xmm0
	movq	%xmm0, -21(%edx)
L(fwd_write_13bytes):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
	/* Return value: the mempcpy build returns %edx, every other build
	   reloads the DEST stack slot.  */
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Forward tail copy for 41, 33, 25, 17, 9 and 1 bytes.  On entry %eax
   and %edx point just past the end of source and destination; each
   label moves 8 bytes and falls through, ending with a single byte.  */
L(fwd_write_41bytes):
	movq	-41(%eax), %xmm0
	movq	%xmm0, -41(%edx)
L(fwd_write_33bytes):
	movq	-33(%eax), %xmm0
	movq	%xmm0, -33(%edx)
L(fwd_write_25bytes):
	movq	-25(%eax), %xmm0
	movq	%xmm0, -25(%edx)
L(fwd_write_17bytes):
	movq	-17(%eax), %xmm0
	movq	%xmm0, -17(%edx)
L(fwd_write_9bytes):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
	/* Return value: the mempcpy build returns %edx, every other build
	   reloads the DEST stack slot.  */
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Forward tail copy for 46, 38, 30, 22, 14 and 6 bytes.  On entry %eax
   and %edx point just past the end of source and destination; each
   label moves 8 bytes and falls through, and the last 6 bytes are
   finished with a 4-byte and a 2-byte move.  */
L(fwd_write_46bytes):
	movq	-46(%eax), %xmm0
	movq	%xmm0, -46(%edx)
L(fwd_write_38bytes):
	movq	-38(%eax), %xmm0
	movq	%xmm0, -38(%edx)
L(fwd_write_30bytes):
	movq	-30(%eax), %xmm0
	movq	%xmm0, -30(%edx)
L(fwd_write_22bytes):
	movq	-22(%eax), %xmm0
	movq	%xmm0, -22(%edx)
L(fwd_write_14bytes):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
	/* Return value: the mempcpy build returns %edx, every other build
	   reloads the DEST stack slot.  */
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Forward tail copy for 42, 34, 26, 18, 10 and 2 bytes.  On entry %eax
   and %edx point just past the end of source and destination; each
   label moves 8 bytes and falls through, ending with a 2-byte move.  */
L(fwd_write_42bytes):
	movq	-42(%eax), %xmm0
	movq	%xmm0, -42(%edx)
L(fwd_write_34bytes):
	movq	-34(%eax), %xmm0
	movq	%xmm0, -34(%edx)
L(fwd_write_26bytes):
	movq	-26(%eax), %xmm0
	movq	%xmm0, -26(%edx)
L(fwd_write_18bytes):
	movq	-18(%eax), %xmm0
	movq	%xmm0, -18(%edx)
L(fwd_write_10bytes):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
	/* Return value: the mempcpy build returns %edx, every other build
	   reloads the DEST stack slot.  */
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Forward tail copy for 47, 39, 31, 23, 15 and 7 bytes.  On entry %eax
   and %edx point just past the end of source and destination; each
   label moves 8 bytes and falls through, and the last 7 bytes are
   finished with a 4-byte, a 2-byte and a 1-byte move.  */
L(fwd_write_47bytes):
	movq	-47(%eax), %xmm0
	movq	%xmm0, -47(%edx)
L(fwd_write_39bytes):
	movq	-39(%eax), %xmm0
	movq	%xmm0, -39(%edx)
L(fwd_write_31bytes):
	movq	-31(%eax), %xmm0
	movq	%xmm0, -31(%edx)
L(fwd_write_23bytes):
	movq	-23(%eax), %xmm0
	movq	%xmm0, -23(%edx)
L(fwd_write_15bytes):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	/* The last two loads happen before their stores; the second one
	   overwrites %eax, which is no longer needed as a pointer.  */
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Forward tail copy for 43, 35, 27, 19, 11 and 3 bytes.  On entry %eax
   and %edx point just past the end of source and destination; each
   label moves 8 bytes and falls through, and the last 3 bytes are
   finished with a 2-byte and a 1-byte move.  */
L(fwd_write_43bytes):
	movq	-43(%eax), %xmm0
	movq	%xmm0, -43(%edx)
L(fwd_write_35bytes):
	movq	-35(%eax), %xmm0
	movq	%xmm0, -35(%edx)
L(fwd_write_27bytes):
	movq	-27(%eax), %xmm0
	movq	%xmm0, -27(%edx)
L(fwd_write_19bytes):
	movq	-19(%eax), %xmm0
	movq	%xmm0, -19(%edx)
L(fwd_write_11bytes):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes):
	/* Both loads happen before the stores; the second one overwrites
	   %eax, which is no longer needed as a pointer.  */
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 40, 24, 8 and 0 bytes.  %eax and %edx
   point just past the end of the data.  The 16-byte steps use movdqa,
   which requires the first byte of the tail to be 16-byte aligned in
   both buffers at the label that is entered.  */
L(fwd_write_40bytes_align):
	movdqa	-40(%eax), %xmm0
	movdqa	%xmm0, -40(%edx)
L(fwd_write_24bytes_align):
	movdqa	-24(%eax), %xmm0
	movdqa	%xmm0, -24(%edx)
L(fwd_write_8bytes_align):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
/* Nothing left to copy: only the return value is produced.  */
L(fwd_write_0bytes_align):
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 32 and 16 bytes using movdqa only.
   %eax and %edx point just past the end of the data and must be
   16-byte aligned, as movdqa requires.  */
L(fwd_write_32bytes_align):
	movdqa	-32(%eax), %xmm0
	movdqa	%xmm0, -32(%edx)
L(fwd_write_16bytes_align):
	movdqa	-16(%eax), %xmm0
	movdqa	%xmm0, -16(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Forward tail copy for exactly 5 bytes, reached through the aligned
   jump table.  Same instructions as L(fwd_write_5bytes): two 4-byte
   moves overlapping by three bytes, both loads before either store,
   with %eax reused as scratch ahead of the return value load.  */
L(fwd_write_5bytes_align):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 45, 29 and 13 bytes.  %eax and %edx
   point just past the end of the data.  The 16-byte steps use movdqa,
   which requires the first byte of the tail to be 16-byte aligned in
   both buffers at the label that is entered; the final 13 bytes use
   8-, 4- and 1-byte moves.  */
L(fwd_write_45bytes_align):
	movdqa	-45(%eax), %xmm0
	movdqa	%xmm0, -45(%edx)
L(fwd_write_29bytes_align):
	movdqa	-29(%eax), %xmm0
	movdqa	%xmm0, -29(%edx)
L(fwd_write_13bytes_align):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 37 and 21 bytes.  %eax and %edx point
   just past the end of the data.  The 16-byte steps use movdqa, which
   requires the first byte of the tail to be 16-byte aligned in both
   buffers; the final 5 bytes use a 4-byte and a 1-byte move.  */
L(fwd_write_37bytes_align):
	movdqa	-37(%eax), %xmm0
	movdqa	%xmm0, -37(%edx)
L(fwd_write_21bytes_align):
	movdqa	-21(%eax), %xmm0
	movdqa	%xmm0, -21(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 41, 25, 9 and 1 bytes.  %eax and %edx
   point just past the end of the data.  The 16-byte steps use movdqa,
   which requires the first byte of the tail to be 16-byte aligned in
   both buffers at the label that is entered.  */
L(fwd_write_41bytes_align):
	movdqa	-41(%eax), %xmm0
	movdqa	%xmm0, -41(%edx)
L(fwd_write_25bytes_align):
	movdqa	-25(%eax), %xmm0
	movdqa	%xmm0, -25(%edx)
L(fwd_write_9bytes_align):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes_align):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 33 and 17 bytes.  %eax and %edx point
   just past the end of the data.  The 16-byte steps use movdqa, which
   requires the first byte of the tail to be 16-byte aligned in both
   buffers; the final byte is moved on its own.  */
L(fwd_write_33bytes_align):
	movdqa	-33(%eax), %xmm0
	movdqa	%xmm0, -33(%edx)
L(fwd_write_17bytes_align):
	movdqa	-17(%eax), %xmm0
	movdqa	%xmm0, -17(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 46, 30, 14 and 6 bytes.  %eax and %edx
   point just past the end of the data.  The 16-byte steps use movdqa,
   which requires the first byte of the tail to be 16-byte aligned in
   both buffers at the label that is entered.  */
L(fwd_write_46bytes_align):
	movdqa	-46(%eax), %xmm0
	movdqa	%xmm0, -46(%edx)
L(fwd_write_30bytes_align):
	movdqa	-30(%eax), %xmm0
	movdqa	%xmm0, -30(%edx)
L(fwd_write_14bytes_align):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes_align):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 38 and 22 bytes.  %eax and %edx point
   just past the end of the data.  The 16-byte steps use movdqa, which
   requires the first byte of the tail to be 16-byte aligned in both
   buffers; the final 6 bytes use a 4-byte and a 2-byte move.  */
L(fwd_write_38bytes_align):
	movdqa	-38(%eax), %xmm0
	movdqa	%xmm0, -38(%edx)
L(fwd_write_22bytes_align):
	movdqa	-22(%eax), %xmm0
	movdqa	%xmm0, -22(%edx)
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 42, 26, 10 and 2 bytes.  %eax and %edx
   point just past the end of the data.  The 16-byte steps use movdqa,
   which requires the first byte of the tail to be 16-byte aligned in
   both buffers at the label that is entered.  */
L(fwd_write_42bytes_align):
	movdqa	-42(%eax), %xmm0
	movdqa	%xmm0, -42(%edx)
L(fwd_write_26bytes_align):
	movdqa	-26(%eax), %xmm0
	movdqa	%xmm0, -26(%edx)
L(fwd_write_10bytes_align):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes_align):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 34 and 18 bytes.  %eax and %edx point
   just past the end of the data.  The 16-byte steps use movdqa, which
   requires the first byte of the tail to be 16-byte aligned in both
   buffers; the final 2 bytes are moved as one word.  */
L(fwd_write_34bytes_align):
	movdqa	-34(%eax), %xmm0
	movdqa	%xmm0, -34(%edx)
L(fwd_write_18bytes_align):
	movdqa	-18(%eax), %xmm0
	movdqa	%xmm0, -18(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 47, 31, 15 and 7 bytes.  %eax and %edx
   point just past the end of the data.  The 16-byte steps use movdqa,
   which requires the first byte of the tail to be 16-byte aligned in
   both buffers at the label that is entered.  */
L(fwd_write_47bytes_align):
	movdqa	-47(%eax), %xmm0
	movdqa	%xmm0, -47(%edx)
L(fwd_write_31bytes_align):
	movdqa	-31(%eax), %xmm0
	movdqa	%xmm0, -31(%edx)
L(fwd_write_15bytes_align):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes_align):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	/* The last two loads happen before their stores; the second one
	   overwrites %eax, which is no longer needed as a pointer.  */
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 39 and 23 bytes.  %eax and %edx point
   just past the end of the data.  The 16-byte steps use movdqa, which
   requires the first byte of the tail to be 16-byte aligned in both
   buffers; the final 7 bytes use 4-, 2- and 1-byte moves.  */
L(fwd_write_39bytes_align):
	movdqa	-39(%eax), %xmm0
	movdqa	%xmm0, -39(%edx)
L(fwd_write_23bytes_align):
	movdqa	-23(%eax), %xmm0
	movdqa	%xmm0, -23(%edx)
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	/* The last two loads happen before their stores; the second one
	   overwrites %eax, which is no longer needed as a pointer.  */
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 43, 27, 11 and 3 bytes.  %eax and %edx
   point just past the end of the data.  The 16-byte steps use movdqa,
   which requires the first byte of the tail to be 16-byte aligned in
   both buffers at the label that is entered.  */
L(fwd_write_43bytes_align):
	movdqa	-43(%eax), %xmm0
	movdqa	%xmm0, -43(%edx)
L(fwd_write_27bytes_align):
	movdqa	-27(%eax), %xmm0
	movdqa	%xmm0, -27(%edx)
L(fwd_write_11bytes_align):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes_align):
	/* Both loads happen before the stores; the second one overwrites
	   %eax, which is no longer needed as a pointer.  */
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 35 and 19 bytes.  %eax and %edx point
   just past the end of the data.  The 16-byte steps use movdqa, which
   requires the first byte of the tail to be 16-byte aligned in both
   buffers; the final 3 bytes use a 2-byte and a 1-byte move.  */
L(fwd_write_35bytes_align):
	movdqa	-35(%eax), %xmm0
	movdqa	%xmm0, -35(%edx)
L(fwd_write_19bytes_align):
	movdqa	-19(%eax), %xmm0
	movdqa	%xmm0, -19(%edx)
	/* Both loads happen before the stores; the second one overwrites
	   %eax, which is no longer needed as a pointer.  */
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 44, 28, 12 and 4 bytes.  %eax and %edx
   point just past the end of the data.  The 16-byte steps use movdqa,
   which requires the first byte of the tail to be 16-byte aligned in
   both buffers at the label that is entered.  */
L(fwd_write_44bytes_align):
	movdqa	-44(%eax), %xmm0
	movdqa	%xmm0, -44(%edx)
L(fwd_write_28bytes_align):
	movdqa	-28(%eax), %xmm0
	movdqa	%xmm0, -28(%edx)
L(fwd_write_12bytes_align):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes_align):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
/* Aligned forward tail copy for 36 and 20 bytes.  %eax and %edx point
   just past the end of the data.  The 16-byte steps use movdqa, which
   requires the first byte of the tail to be 16-byte aligned in both
   buffers; the final 4 bytes are moved as one dword.  This is the last
   forward handler and ends in RETURN_END instead of RETURN (both
   macros are defined outside this chunk).  */
L(fwd_write_36bytes_align):
	movdqa	-36(%eax), %xmm0
	movdqa	%xmm0, -36(%edx)
L(fwd_write_20bytes_align):
	movdqa	-20(%eax), %xmm0
	movdqa	%xmm0, -20(%edx)
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN_END

	CFI_PUSH (%edi)

	.p2align 4
/* Forward copy that reads with unaligned loads (movdqu) and writes
   with non-temporal stores (movntdq).  movntdq requires a 16-byte
   aligned address, so %edx must be aligned on entry.
   NOTE(review): the condition that selects this path is outside this
   chunk; presumably it is used for very large lengths -- confirm.  */
L(large_page):
	movdqu	(%eax), %xmm1
#ifdef USE_AS_MEMMOVE
	/* NOTE(review): %xmm0 is loaded before this chunk; presumably it
	   holds the head of the source, stored late for overlap safety.  */
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	lea	16(%eax), %eax
	movntdq	%xmm1, (%edx)
	lea	16(%edx), %edx
	/* 0x10 bytes were just copied; the further 0x80 biases the count
	   for the sub/jae test of the 128-byte loop.  */
	lea	-0x90(%ecx), %ecx
	/* %edi is not referenced again on this path, so it is restored
	   here and the final jump-table branch needs no POP.  */
	POP (%edi)

	.p2align 4
/* 128 bytes per iteration: eight loads, then eight streaming stores.
   jae uses the carry of the sub; the movntdq and lea instructions that
   follow the sub do not modify the flags.  */
L(large_page_loop):
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	movdqu	0x40(%eax), %xmm4
	movdqu	0x50(%eax), %xmm5
	movdqu	0x60(%eax), %xmm6
	movdqu	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax

	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	movntdq	%xmm4, 0x40(%edx)
	movntdq	%xmm5, 0x50(%edx)
	movntdq	%xmm6, 0x60(%edx)
	movntdq	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx
	jae	L(large_page_loop)
	/* %ecx is now (bytes left - 0x80).  Compare first, then remove the
	   bias with lea so the flags survive: jl is taken when fewer than
	   0x40 bytes are left.  */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(large_page_less_64bytes)

	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	lea	0x40(%eax), %eax

	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	lea	0x40(%edx), %edx
	sub	$0x40, %ecx
L(large_page_less_64bytes):
	cmp	$32, %ecx
	jb	L(large_page_less_32bytes)
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	lea	0x20(%eax), %eax
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	lea	0x20(%edx), %edx
	sub	$0x20, %ecx
L(large_page_less_32bytes):
	/* Move both pointers past the %ecx bytes still pending; the tail
	   handlers address the data with negative offsets.  */
	add	%ecx, %edx
	add	%ecx, %eax
	/* Order the non-temporal stores ahead of the ordinary stores made
	   by the tail handler.  */
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

	.p2align 4
/* Backward tail copy for 44, 36, 28, 20, 12, 4 and 0 bytes.  On entry
   %eax and %edx point at the first of the remaining bytes; each label
   moves the 8 bytes at the highest offset and falls through to the
   next smaller size.  */
L(bk_write_44bytes):
	movq	36(%eax), %xmm0
	movq	%xmm0, 36(%edx)
L(bk_write_36bytes):
	movq	28(%eax), %xmm0
	movq	%xmm0, 28(%edx)
L(bk_write_28bytes):
	movq	20(%eax), %xmm0
	movq	%xmm0, 20(%edx)
L(bk_write_20bytes):
	movq	12(%eax), %xmm0
	movq	%xmm0, 12(%edx)
L(bk_write_12bytes):
	movq	4(%eax), %xmm0
	movq	%xmm0, 4(%edx)
L(bk_write_4bytes):
	movl	(%eax), %ecx
	movl	%ecx, (%edx)
L(bk_write_0bytes):
	/* Return value: the DEST stack slot, plus the LEN stack slot in the
	   mempcpy build.  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
/* Backward tail copy for 40, 32, 24, 16 and 8 bytes.  On entry %eax
   and %edx point at the first of the remaining bytes; each label moves
   the 8 bytes at the highest offset and falls through to the next
   smaller size.  */
L(bk_write_40bytes):
	movq	32(%eax), %xmm0
	movq	%xmm0, 32(%edx)
L(bk_write_32bytes):
	movq	24(%eax), %xmm0
	movq	%xmm0, 24(%edx)
L(bk_write_24bytes):
	movq	16(%eax), %xmm0
	movq	%xmm0, 16(%edx)
L(bk_write_16bytes):
	movq	8(%eax), %xmm0
	movq	%xmm0, 8(%edx)
L(bk_write_8bytes):
	movq	(%eax), %xmm0
	movq	%xmm0, (%edx)
	/* Return value: the DEST stack slot, plus the LEN stack slot in the
	   mempcpy build.  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
/* Backward tail copy for 45, 37, 29, 21, 13, 5 and 1 bytes.  On entry
   %eax and %edx point at the first of the remaining bytes; each label
   moves the piece at the highest offset and falls through, ending with
   a 4-byte and then a 1-byte move.  */
L(bk_write_45bytes):
	movq	37(%eax), %xmm0
	movq	%xmm0, 37(%edx)
L(bk_write_37bytes):
	movq	29(%eax), %xmm0
	movq	%xmm0, 29(%edx)
L(bk_write_29bytes):
	movq	21(%eax), %xmm0
	movq	%xmm0, 21(%edx)
L(bk_write_21bytes):
	movq	13(%eax), %xmm0
	movq	%xmm0, 13(%edx)
L(bk_write_13bytes):
	movq	5(%eax), %xmm0
	movq	%xmm0, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	/* Return value: the DEST stack slot, plus the LEN stack slot in the
	   mempcpy build.  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
/* Backward tail copy for 41, 33, 25, 17 and 9 bytes.  On entry %eax
   and %edx point at the first of the remaining bytes; each label moves
   the 8 bytes at the highest offset and falls through, and the first
   byte is moved last.  */
L(bk_write_41bytes):
	movq	33(%eax), %xmm0
	movq	%xmm0, 33(%edx)
L(bk_write_33bytes):
	movq	25(%eax), %xmm0
	movq	%xmm0, 25(%edx)
L(bk_write_25bytes):
	movq	17(%eax), %xmm0
	movq	%xmm0, 17(%edx)
L(bk_write_17bytes):
	movq	9(%eax), %xmm0
	movq	%xmm0, 9(%edx)
L(bk_write_9bytes):
	movq	1(%eax), %xmm0
	movq	%xmm0, 1(%edx)
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	/* Return value: the DEST stack slot, plus the LEN stack slot in the
	   mempcpy build.  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
/* Backward tail copy for 46, 38, 30, 22, 14 and 6 bytes.  On entry
   %eax and %edx point at the first of the remaining bytes; each label
   moves the 8 bytes at the highest offset and falls through, and the
   first 6 bytes are finished with a 4-byte and a 2-byte move.  */
L(bk_write_46bytes):
	movq	38(%eax), %xmm0
	movq	%xmm0, 38(%edx)
L(bk_write_38bytes):
	movq	30(%eax), %xmm0
	movq	%xmm0, 30(%edx)
L(bk_write_30bytes):
	movq	22(%eax), %xmm0
	movq	%xmm0, 22(%edx)
L(bk_write_22bytes):
	movq	14(%eax), %xmm0
	movq	%xmm0, 14(%edx)
L(bk_write_14bytes):
	movq	6(%eax), %xmm0
	movq	%xmm0, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	/* Return value: the DEST stack slot, plus the LEN stack slot in the
	   mempcpy build.  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
/* Backward tail copy for 42, 34, 26, 18, 10 and 2 bytes.  On entry
   %eax and %edx point at the first of the remaining bytes; each label
   moves the 8 bytes at the highest offset and falls through, ending
   with a 2-byte move.  */
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	/* Return value: the DEST stack slot, plus the LEN stack slot in the
	   mempcpy build.  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
/* Backward tail copy for 47, 39, 31, 23, 15 and 7 bytes.  On entry
   %eax and %edx point at the first of the remaining bytes; each label
   moves the 8 bytes at the highest offset and falls through, and the
   first 7 bytes are finished with 4-, 2- and 1-byte moves.  */
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	/* Last source read: %eax is overwritten with the loaded byte and
	   then with the return value.  */
	movzbl	(%eax), %eax
	movb	%al, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
/* Backward tail copy for 43, 35, 27, 19, 11 and 3 bytes.  On entry
   %eax and %edx point at the first of the remaining bytes; each label
   moves the 8 bytes at the highest offset and falls through, and the
   first 3 bytes are finished with a 2-byte and a 1-byte move.  This
   handler ends in RETURN_END instead of RETURN (both macros are
   defined outside this chunk).  */
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	/* Last source read: %eax is overwritten with the loaded byte and
	   then with the return value.  */
	movzbl	(%eax), %eax
	movb	%al, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN_END


	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
/* Jump table for the forward tail handlers.  Entry N (4 bytes each)
   refers to L(fwd_write_Nbytes), for N = 0..47; the code in this chunk
   indexes it with the remaining byte count in %ecx and a scale of 4.
   JMPTBL is defined outside this chunk; presumably it encodes each
   target relative to the table start in position-independent builds.  */
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	.p2align 2
/* Jump table for the aligned forward tail handlers.  Entry N (4 bytes
   each) refers to L(fwd_write_Nbytes_align), for N = 0..47.  The code
   that dispatches through this table is outside this chunk.  JMPTBL is
   defined outside this chunk as well; presumably it encodes each target
   relative to the table start in position-independent builds.  */
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

	.p2align 2
/* Dispatch table with one 32-bit entry per L(shl_N) handler, N = 0..15,
   aligned to 4 bytes.  Each entry is produced by JMPTBL from the handler
   label and the table's own label.  JMPTBL is defined outside this
   section; presumably it yields a table-relative offset in PIC builds
   (TODO confirm).  Neither the handlers nor the code that indexes this
   table are in this part of the file, so entry N must stay at index N.  */
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	.p2align 2
/* Dispatch table for the tail of a backward copy: entry N refers to
   L(bk_write_Nbytes), N = 0..47, one 32-bit entry each.  It is used by
   BRANCH_TO_JMPTBL_ENTRY at L(bk_write_less32bytes_2) with %ecx (the
   remaining byte count) as index and a scale of 4, so entry N must stay
   at index N.  The L(bk_write_Nbytes) handlers are outside this section;
   JMPTBL is a macro defined elsewhere in the file, presumably producing
   a table-relative offset in PIC builds (TODO confirm).  */
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection

#ifdef USE_AS_MEMMOVE
	.p2align 4
/* Entry of the backward (high-to-low address) copy path, built only when
   USE_AS_MEMMOVE is defined.
   Register use, as established by the code that follows:
     EAX  in: start of the bytes to read.  It is copied into EDI so that
	      EAX can be used as scratch.
     EDX  in: start of the bytes to write.
     ECX  number of bytes still to copy.
   EDI and EDX are moved to one past the end of their regions; all later
   steps decrement them before each access.  EDI is saved with PUSH here
   and restored with POP at L(bk_write_less32bytes) (both are macros
   defined earlier in the file).  */
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	/* EDX = write start + ECX, EDI = read start + ECX.  */
	lea	(%ecx,%edx,1),%edx
	lea	(%ecx,%edi,1),%edi
	/* If the end of the write region is not 4-byte aligned, let
	   L(bk_align) handle it; otherwise fall into L(bk_aligned_4).  */
	testl	$0x3, %edx
	jnz	L(bk_align)

/* Size dispatch for the backward copy.  ECX bytes remain, ending just
   below EDI (read side) and EDX (write side).  */
L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

/* Fewer than 64 bytes remain (unsigned comparison).  */
L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy a single 32-byte chunk as four 8-byte moves through XMM0,
	   highest address first, then step both pointers down by 32.
	   There is no loop here: on every path to this label visible in
	   this section ECX is below 64, so fewer than 32 bytes remain
	   afterwards.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

/* Tail: rewind both pointers to the START of the remaining ECX bytes
   (EAX = EDI - ECX for the read side, EDX -= ECX for the write side),
   restore the saved EDI, and dispatch on ECX.  */
L(bk_write_less32bytes):
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
/* BRANCH_TO_JMPTBL_ENTRY is a macro defined outside this section;
   presumably it jumps through entry ECX (4 bytes per entry) of
   L(table_48_bytes_bwd), i.e. to L(bk_write_<ECX>bytes) -- TODO confirm.
   Those handlers are not in this part of the file.  */
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* The code after this point is entered only by jumps taken while
	   EDI is still saved on the stack.  CFI_PUSH is defined outside
	   this section; presumably it re-declares that saved slot for the
	   unwinder without emitting an instruction (TODO confirm).  */
	CFI_PUSH (%edi)

	.p2align 4
/* Reached from L(copy_backward) when (EDX & 3) != 0.  Copies one byte
   and/or one 16-bit word from just below EDI to just below EDX so that
   EDX becomes 4-byte aligned, then continues at L(bk_aligned_4).
   EAX is used as scratch for the data being moved.  */
L(bk_align):
	/* With 8 bytes or fewer left, skip the alignment and go straight
	   to the jump-table tail.  */
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
	   then (EDX & 2) must be != 0: a 2-byte copy is all that is
	   needed.  */
	jz	L(bk_got2)
	/* EDX is odd: move one byte to make it even.  */
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	/* EDX is now even; if bit 1 is clear it is already 4-byte
	   aligned.  */
	testl	$2, %edx
	jz	L(bk_aligned_4)

/* EDX is even but not a multiple of 4: move one 16-bit word.  */
L(bk_got2):
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	.p2align 4
/* Reached from L(bk_aligned_4) with ECX >= 64.  Moves 4 bytes at a time
   (through EAX) until EDX is 16-byte aligned, then continues at
   L(bk_ssse3_cpy_pre).  */
L(bk_write_more64bytes):
	/* Check the 16-byte alignment of the write pointer (one past the
	   last byte still to be written).  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  At most three 4-byte steps
   are needed, so the third step below is not followed by a test.  */
L(bk_ssse3_align):
	/* First 4-byte step.  */
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	/* Second 4-byte step.  */
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	/* Third 4-byte step; falls through to L(bk_ssse3_cpy_pre).  */
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

/* EDX is 16-byte aligned here.  The alignment steps in
   L(bk_ssse3_align) may have consumed up to 12 of the (at least 64)
   bytes, so ECX can be below 64 but not below 52; in that case one
   32-byte step at L(bk_write_more32bytes) is taken instead of the
   64-byte loop.  */
L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	.p2align 4
/* Main backward loop: 64 bytes per iteration.  The pointers are moved
   down first, then the four 16-byte chunks are copied from the highest
   offset to the lowest.  Loads use movdqu because EDI has no known
   alignment; stores use movdqa, which requires the 16-byte alignment of
   EDX established before the loop (and preserved by the 64-byte
   stride).  */
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	/* Loop while at least 64 bytes remain; the remaining 0..63 bytes
	   are finished at L(bk_write_64bytesless).  */
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif

END (MEMCPY)