/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Local (non-exported) label helper: L(foo) expands to .Lfoo.  */
#if !defined(L)
# define L(sym)	.L##sym
#endif

/* CFI directive wrappers; a build may pre-define them to something else.  */
#if !defined(cfi_startproc)
# define cfi_startproc	.cfi_startproc
#endif

#if !defined(cfi_endproc)
# define cfi_endproc	.cfi_endproc
#endif

/* ENTRY(fn): export fn as a function symbol, align it to 16 bytes, place
   the label and open its CFI region.  */
#if !defined(ENTRY)
# define ENTRY(fn)	\
	.globl fn;	\
	.type fn, @function;	\
	.p2align 4;	\
fn:	\
	cfi_startproc
#endif

/* END(fn): close the CFI region and record the symbol size.  */
#if !defined(END)
# define END(fn)	\
	cfi_endproc;	\
	.size fn, .-fn
#endif


/* Exported symbol name; a wrapper file may pre-define STRLCPY.  */
#if !defined(STRLCPY)
# define STRLCPY	strlcpy
#endif

/* A jump-table entry is the 32-bit distance from the table base to the
   target label.  */
#define JMPTBL(TARGET, BASE)	TARGET - BASE

/* Indirect jump through a table of JMPTBL entries:
     %r11 = address of TABLE
     %rcx = sign-extended entry INDEX (each entry SCALE bytes wide)
   then jump to %r11 + %rcx.  Clobbers %r11, %rcx and the flags.  */
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
	lea	TABLE(%rip), %r11;	\
	movslq	(%r11, INDEX, SCALE), %rcx;	\
	add	%r11, %rcx;	\
	jmp	*%rcx

/* Common function exit: the result is %rax + %r9.  */
#define RETURN	\
	add	%r9, %rax;	\
	ret

/*
 * STRLCPY entry point (strlcpy by default; the USE_AS_STRLCAT sections
 * add a scan of the destination in front of the copy).
 *
 * Register use, as read by the code below:
 *   %rdi  destination pointer
 *   %rsi  source pointer
 *   %rdx  size argument; moved to %r8, which then tracks the bytes left
 *   %rax, %r9  partial counts; RETURN yields %rax + %r9
 * A size of zero skips all copying and goes straight to measuring the
 * source string.
 */
	.text
ENTRY (STRLCPY)
	mov	%rdx, %r8
	xor	%eax, %eax
	xor	%r9d, %r9d
	test	%r8, %r8
	jz	L(CalculateSrcLen)

#ifdef USE_AS_STRLCAT
	/* Concatenating variant only: look for the zero byte that ends the
	   string already stored at %rdi, examining at most %r8 (size) bytes.
	   %rax is the offset of the 16-byte block being examined.  */
	xor	%rcx, %rcx
	pxor	%xmm0, %xmm0

	/* Destination bytes 0..15 (unaligned load).  */
	movdqu	(%rdi), %xmm1
	pcmpeqb %xmm1, %xmm0
	pmovmskb %xmm0, %rdx

	cmp	$17, %r8
	jb	L(SizeEndCase1)		/* size <= 16 */
	test	%rdx, %rdx
	jnz	L(StringEndCase1)

	/* Destination bytes 16..31.  */
	add	$16, %rax
	movdqu	16(%rdi), %xmm1
	pcmpeqb %xmm1, %xmm0
	pmovmskb %xmm0, %rdx

	cmp	$33, %r8
	jb	L(SizeEndCase1)		/* size <= 32 */
	test	%rdx, %rdx
	jnz	L(StringEndCase1)

	/* Switch to aligned loads: %rcx = %rdi & 15, %rdi is rounded down to
	   a 16-byte boundary and %r8 is rebased by the same amount.  */
	mov	%rdi, %rcx
	and	$15, %rcx
	and	$-16, %rdi

	add	%rcx, %r8
	sub	$16, %r8

	/* One aligned 16-byte block per iteration, starting at %rdi + 16.  */
L(DstLenLoop):
	movdqa	(%rdi, %rax), %xmm1
	pcmpeqb %xmm1, %xmm0
	pmovmskb %xmm0, %rdx
	sub	$16, %r8
	jbe	L(SizeEndCase2)		/* the size limit ends inside this block */
	test	%rdx, %rdx
	jnz	L(StringEndCase2)
	add	$16, %rax
	jmp	L(DstLenLoop)

	/* Zero byte found in the aligned loop with size to spare: point %rdi
	   at it and leave in %r8 the bytes remaining from that position.  */
L(StringEndCase2):
	add	$16, %r8
	bsf	%rdx, %rdx
	sub	%rdx, %r8
	add	%rdx, %rax
	sub	%rcx, %r9		/* removes the alignment bias once CopySrcString adds %rax to %r9 */
	add	%rax, %rdi
	jmp	 L(CopySrcString)

	/* The size limit falls within the first 32 destination bytes.  */
L(SizeEndCase1):
	test	%rdx, %rdx
	jz	L(SizeEnd)
	bsf	%rdx, %rdx
	add	%rdx, %rax
	cmp	%r8, %rax
	jb	L(StringEnd)		/* zero byte lies below the size limit */
	/* No zero byte below the size limit: %r9 = size and only the source
	   length is still computed.  */
L(SizeEnd):
	mov	%r8, %r9
	jmp	L(CalculateSrcLenCase1)

	/* The size limit falls within the current aligned block; %r8 is
	   restored to the bytes left counted from the start of that block.  */
L(SizeEndCase2):
	add	$16, %r8
	test	%rdx, %rdx
	jz	L(StringEndCase4)
	bsf	%rdx, %rdx
	cmp	%r8, %rdx
	jb	L(StringEndCase3)	/* zero byte lies below the size limit */
	/* No zero byte below the size limit: %r9 = block offset + bytes left
	   - alignment bias, then only the source length is computed.  */
L(StringEndCase4):
	add	%r8, %rax
	sub	%rcx, %rax
	mov	%rax, %r9
	jmp	L(CalculateSrcLenCase1)

	/* Zero byte found in the last permitted block: same bookkeeping as
	   StringEndCase2.  */
L(StringEndCase3):
	add	%rdx, %rax
	sub	%rcx, %r9
	add	%rax, %rdi
	sub	%rdx, %r8
	jmp	L(CopySrcString)

	/* Zero byte found within the first 32 bytes (%rcx is still 0 here).  */
L(StringEndCase1):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	sub	%rcx, %rax
	/* %rax = offset of the zero byte: advance %rdi to it and reduce the
	   bytes left accordingly.  %rax is kept for the code that follows.  */
L(StringEnd):
	add	%rax, %rdi
	sub	%rax, %r8
#endif

	/* Pick the start-up strategy from the source offset inside its
	   64-byte line.  With an offset of at most 32, the two unaligned
	   16-byte loads made by CopySrcString end at or before offset 63 of
	   the same line.  */
	mov	%rsi, %rcx
	and	$63, %rcx
	cmp	$32, %rcx
	jbe	L(CopySrcString)

	/* Otherwise use aligned loads from the start: %rsi is rounded down to
	   a 16-byte boundary and %rcx = original %rsi & 15.  */
	and	$-16, %rsi
	and	$15, %rcx
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1

	/* First aligned block; the shift discards mask bits that belong to
	   bytes in front of the real start of the source.  */
	pcmpeqb	(%rsi), %xmm1
	pmovmskb %xmm1, %rdx
	shr	%cl, %rdx
	mov	$16, %r10
	sub	%rcx, %r10		/* %r10 = source bytes covered so far */
	cmp	%r10, %r8
	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
	test	%rdx, %rdx
	jnz	L(CopyFrom1To16BytesTail)

	/* Second aligned block.  */
	pcmpeqb	16(%rsi), %xmm0
	pmovmskb %xmm0, %rdx
	add	$16, %r10
	cmp	%r10, %r8
	jbe	L(CopyFrom1To32BytesCase2OrCase3)
	test	%rdx, %rdx
	jnz	L(CopyFrom1To32Bytes)

	/* Neither block held a zero byte and size extends beyond them: copy
	   the first 16 source bytes (%rsi + %rcx is the original pointer)
	   and continue in the aligned loop.  */
	movdqu	(%rsi, %rcx), %xmm1
	movdqu	%xmm1, (%rdi)
#ifdef USE_AS_STRLCAT
	add	%rax, %r9		/* LoopStart overwrites %rax, so bank its value in %r9 */
#endif
	jmp	L(LoopStart)

	.p2align 4
	/* Start-up with unaligned loads: examine (and copy) the first two
	   16-byte blocks at %rsi, then fall into the aligned loop.  */
L(CopySrcString):
#ifdef USE_AS_STRLCAT
	add	%rax, %r9		/* bank the count held in %rax, then restart it at 0 */
	xor	%rax, %rax
#endif
	pxor	%xmm0, %xmm0
	movdqu	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %rdx

	cmp	$17, %r8
	jb	L(CopyFrom1To16BytesTail1Case2OrCase3)	/* %r8 <= 16 */
	test	%rdx, %rdx
	jnz	L(CopyFrom1To16BytesTail1)

	/* No zero byte in bytes 0..15: store them and examine bytes 16..31.  */
	movdqu	16(%rsi), %xmm2
	pcmpeqb	%xmm2, %xmm0
	movdqu	%xmm1, (%rdi)
	pmovmskb %xmm0, %rdx
	add	$16, %rax

	cmp	$33, %r8
	jb	L(CopyFrom1To32Bytes1Case2OrCase3)	/* %r8 <= 32 */
	test	%rdx, %rdx
	jnz	L(CopyFrom1To32Bytes1)

	/* %rcx = %rsi & 15; %rsi is rounded down to a 16-byte boundary.  */
	mov	%rsi, %rcx
	and	$15, %rcx
	and	$-16, %rsi

	/* Shift %rdi back and %r8 forward by %rcx so that the same index
	   (%rax) addresses matching bytes of source and destination; the
	   loop starts with the block at offset 16.  */
L(LoopStart):
	sub	%rcx, %rdi
	add	%rcx, %r8
	sub	$16, %r8
	mov	$16, %rax

	/* Main loop: one aligned 16-byte source block per iteration.  %xmm0
	   is all zero on entry to every iteration, because the loop only
	   continues when the compare produced an empty mask.  */
L(16Loop):
	movdqa	(%rsi, %rax), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %rdx
	sub	$16, %r8
	jbe	L(CopyFrom1To16BytesCase2OrCase3)	/* the size limit ends inside this block */
	test	%rdx, %rdx
	jnz	L(CopyFrom1To16BytesXmmExit)
	movdqu	%xmm1, (%rdi, %rax)
	add	$16, %rax
	jmp	L(16Loop)

/*------End of main part with loops---------------------*/

/* Case1: a zero byte was found.  Each label below turns the compare mask in
   %rdx into a byte index relative to the adjusted %rsi, adds that index to
   %rax and dispatches through ExitStringTailTable on it.  */
	.p2align 4
	/* NOTE(review): no branch to this label is visible in this file.  */
L(CopyFrom1To16Bytes):
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%rdx, %rdx
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

	.p2align 4
	/* Zero byte in the first aligned block of the aligned start-up path;
	   adding %rcx restores the original source pointer.  */
L(CopyFrom1To16BytesTail):
	add	%rcx, %rsi
	bsf	%rdx, %rdx
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

	.p2align 4
	/* Zero byte in bytes 16..31 of the unaligned start-up path: step both
	   pointers past the 16 bytes already stored.  */
L(CopyFrom1To32Bytes1):
	add	$16, %rsi
	add	$16, %rdi
	sub	$16, %r8
	/* Zero byte in bytes 0..15 of the unaligned start-up path.  */
L(CopyFrom1To16BytesTail1):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

	.p2align 4
	/* Zero byte in the second aligned block of the aligned start-up path:
	   index = bit position + 16 - %rcx, counted from the original source
	   pointer.  */
L(CopyFrom1To32Bytes):
	bsf	%rdx, %rdx
	add	%rcx, %rsi
	add	$16, %rdx
	sub	%rcx, %rdx
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

	.p2align 4
	/* Shared tail for the Case2 labels; %rdx already holds the index.  */
L(CopyFrom1To16BytesExit):
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

/* Case2: the block holds a zero byte and the size limit also ends in it.
   The zero-byte index (%rdx) is compared with the bytes left (%r8): when
   the index is below %r8 the string fits and CopyFrom1To16BytesExit is
   used; otherwise %r8 is added to %rax and the copy is truncated through
   ExitTable indexed by %r8.  */

	.p2align 4
	/* Reached from the aligned loop: undo the loop's subtraction from
	   %r8, advance both pointers to the current block and remove the
	   alignment bias (%rcx) from %rax.  */
L(CopyFrom1To16BytesCase2):
	add	$16, %r8
	add	%rax, %rdi
	add	%rax, %rsi
	bsf	%rdx, %rdx
	sub	%rcx, %rax
	cmp	%r8, %rdx
	jb	L(CopyFrom1To16BytesExit)
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
	/* Second aligned block of the aligned start-up path; the index is
	   rebased to the original source pointer.  */
L(CopyFrom1To32BytesCase2):
	add	%rcx, %rsi
	bsf	%rdx, %rdx
	add	$16, %rdx
	sub	%rcx, %rdx
	cmp	%r8, %rdx
	jb	L(CopyFrom1To16BytesExit)
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	/* First aligned block of the aligned start-up path.  */
L(CopyFrom1To16BytesTailCase2):
	add	%rcx, %rsi
	bsf	%rdx, %rdx
	cmp	%r8, %rdx
	jb	L(CopyFrom1To16BytesExit)
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
	/* Unaligned start-up path; the pointers need no adjustment here.  */
L(CopyFrom1To16BytesTail1Case2):
	bsf	%rdx, %rdx
	cmp	%r8, %rdx
	jb	L(CopyFrom1To16BytesExit)
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

/* Case2 or Case3: the size limit ends inside the current block.  A non-empty
   mask in %rdx means a zero byte is present as well (Case2, handled by the
   matching ...Case2 label).  An empty mask is Case3: %r8 is added to %rax
   and the copy is truncated through ExitTable indexed by %r8.  */

	.p2align 4
	/* Reached from the aligned loop.  */
L(CopyFrom1To16BytesCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyFrom1To16BytesCase2)
	add	$16, %r8
	add	%rax, %rdi
	add	%rax, %rsi
	add	%r8, %rax
	sub	%rcx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
	/* Second aligned block of the aligned start-up path.  */
L(CopyFrom1To32BytesCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyFrom1To32BytesCase2)
	add	%rcx, %rsi
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
	/* First aligned block of the aligned start-up path.  */
L(CopyFrom1To16BytesTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyFrom1To16BytesTailCase2)
	add	%rcx, %rsi
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
	/* Bytes 16..31 of the unaligned start-up path: step past the 16
	   bytes already stored, then share the code below.  */
L(CopyFrom1To32Bytes1Case2OrCase3):
	add	$16, %rdi
	add	$16, %rsi
	sub	$16, %r8
	/* Bytes 0..15 of the unaligned start-up path.  */
L(CopyFrom1To16BytesTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyFrom1To16BytesTail1Case2)
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
	/* Zero byte found in the aligned loop with size to spare: advance
	   both pointers to the current block, finish the count in %rax
	   (block offset + index - alignment bias) and copy the tail.  */
L(CopyFrom1To16BytesXmmExit):
	bsf	%rdx, %rdx
	add	%rax, %rdi
	add	%rax, %rsi
	add	%rdx, %rax
	sub	%rcx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

/*------------End of the labels that handle copying 1-16 and 1-32 bytes----*/


	.p2align 4
/* ExitTable targets, selected by the bytes left in %r8.  ExitN copies N - 1
   bytes from (%rsi) to (%rdi), stores a zero byte at offset N - 1 and then
   continues at CalculateSrcLen.  Overlapping moves are used so that every
   length needs at most two loads and two stores.  Scratch: %rcx, %rdx.  */
L(Exit0):
	RETURN

	.p2align 4
/* 1 byte left: terminator only.  */
L(Exit1):
	movb	$0, (%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 2 bytes left: 1 data byte.  */
L(Exit2):
	movb	(%rsi), %dl
	movb	%dl, (%rdi)
	movb	$0, 1(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 3 bytes left: 2 data bytes.  */
L(Exit3):
	movw	(%rsi), %dx
	movw	%dx, (%rdi)
	movb	$0, 2(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 4 bytes left: 2 + 1 data bytes.  */
L(Exit4):
	movw	(%rsi), %cx
	movb	2(%rsi), %dl
	movw	%cx, (%rdi)
	movb	%dl, 2(%rdi)
	movb	$0, 3(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 5 bytes left: 4 data bytes.  */
L(Exit5):
	movl	(%rsi), %edx
	movl	%edx, (%rdi)
	movb	$0, 4(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 6 bytes left: 4 + 1 data bytes.  */
L(Exit6):
	movl	(%rsi), %ecx
	movb	4(%rsi), %dl
	movl	%ecx, (%rdi)
	movb	%dl, 4(%rdi)
	movb	$0, 5(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 7 bytes left: 4 + 2 data bytes.  */
L(Exit7):
	movl	(%rsi), %ecx
	movw	4(%rsi), %dx
	movl	%ecx, (%rdi)
	movw	%dx, 4(%rdi)
	movb	$0, 6(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 8 bytes left: 7 data bytes as two overlapping 4-byte moves.  */
L(Exit8):
	movl	(%rsi), %ecx
	movl	3(%rsi), %edx
	movl	%ecx, (%rdi)
	movl	%edx, 3(%rdi)
	movb	$0, 7(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 9 bytes left: 8 data bytes.  */
L(Exit9):
	movq	(%rsi), %rdx
	movq	%rdx, (%rdi)
	movb	$0, 8(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 10 bytes left: 8 + 1 data bytes.  */
L(Exit10):
	movq	(%rsi), %rcx
	movb	8(%rsi), %dl
	movq	%rcx, (%rdi)
	movb	%dl, 8(%rdi)
	movb	$0, 9(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 11 bytes left: 8 + 2 data bytes.  */
L(Exit11):
	movq	(%rsi), %rcx
	movw	8(%rsi), %dx
	movq	%rcx, (%rdi)
	movw	%dx, 8(%rdi)
	movb	$0, 10(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 12 bytes left: 11 data bytes (8, then 4 at offset 7).  */
L(Exit12):
	movq	(%rsi), %rcx
	movl	7(%rsi), %edx
	movq	%rcx, (%rdi)
	movl	%edx, 7(%rdi)
	movb	$0, 11(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 13 bytes left: 8 + 4 data bytes.  */
L(Exit13):
	movq	(%rsi), %rcx
	movl	8(%rsi), %edx
	movq	%rcx, (%rdi)
	movl	%edx, 8(%rdi)
	movb	$0, 12(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 14 bytes left: 13 data bytes (8, then 8 at offset 5).  */
L(Exit14):
	movq	(%rsi), %rcx
	movq	5(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 5(%rdi)
	movb	$0, 13(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 15 bytes left: 14 data bytes (8, then 8 at offset 6).  */
L(Exit15):
	movq	(%rsi), %rcx
	movq	6(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 6(%rdi)
	movb	$0, 14(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 16 bytes left: 15 data bytes (8, then 8 at offset 7).  */
L(Exit16):
	movq	(%rsi), %rcx
	movq	7(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 7(%rdi)
	movb	$0, 15(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* ExitTable targets for 17..32 bytes left (%r8).  ExitN copies N - 1 bytes
   from (%rsi) to (%rdi), stores a zero byte at offset N - 1 and continues
   at CalculateSrcLen.  The first 16 bytes always travel through %xmm0.
   Scratch: %rcx, %rdx, %xmm0, %xmm2.  */
L(Exit17):
	movdqu	(%rsi), %xmm0
	movdqu	%xmm0, (%rdi)
	movb	$0, 16(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 18 bytes left: 16 + 1 data bytes.  */
L(Exit18):
	movdqu	(%rsi), %xmm0
	movb	16(%rsi), %dl
	movdqu	%xmm0, (%rdi)
	movb	%dl, 16(%rdi)
	movb	$0, 17(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 19 bytes left: 16 + 2 data bytes.  */
L(Exit19):
	movdqu	(%rsi), %xmm0
	movw	16(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	movw	%cx, 16(%rdi)
	movb	$0, 18(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 20 bytes left: 19 data bytes (16, then 4 at offset 15).  */
L(Exit20):
	movdqu	(%rsi), %xmm0
	movl	15(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 15(%rdi)
	movb	$0, 19(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 21 bytes left: 16 + 4 data bytes.  */
L(Exit21):
	movdqu	(%rsi), %xmm0
	movl	16(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 16(%rdi)
	movb	$0, 20(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 22 bytes left: 16 + 4 + 1 data bytes.  */
L(Exit22):
	movdqu	(%rsi), %xmm0
	movl	16(%rsi), %ecx
	movb	20(%rsi), %dl
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 16(%rdi)
	movb	%dl, 20(%rdi)
	movb	$0, 21(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 23 bytes left: 22 data bytes (16, then 8 at offset 14).  */
L(Exit23):
	movdqu	(%rsi), %xmm0
	movq	14(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 14(%rdi)
	movb	$0, 22(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 24 bytes left: 23 data bytes (16, then 8 at offset 15).  */
L(Exit24):
	movdqu	(%rsi), %xmm0
	movq	15(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 15(%rdi)
	movb	$0, 23(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 25 bytes left: 16 + 8 data bytes.  */
L(Exit25):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 16(%rdi)
	movb	$0, 24(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 26 bytes left: 16 + 8 + 1 data bytes.  */
L(Exit26):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rcx
	movb	24(%rsi), %dl
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 16(%rdi)
	movb	%dl, 24(%rdi)
	movb	$0, 25(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 27 bytes left: 16 + 8 + 2 data bytes.  */
L(Exit27):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movw	24(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movw	%cx, 24(%rdi)
	movb	$0, 26(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 28 bytes left: 27 data bytes (16 + 8, then 4 at offset 23).  */
L(Exit28):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movl	23(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movl	%ecx, 23(%rdi)
	movb	$0, 27(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 29 bytes left: 16 + 8 + 4 data bytes.  */
L(Exit29):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movl	24(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movl	%ecx, 24(%rdi)
	movb	$0, 28(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 30 bytes left: 29 data bytes (16, then 16 at offset 13).  */
L(Exit30):
	movdqu	(%rsi), %xmm0
	movdqu	13(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 13(%rdi)
	movb	$0, 29(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 31 bytes left: 30 data bytes (16, then 16 at offset 14).  */
L(Exit31):
	movdqu	(%rsi), %xmm0
	movdqu	14(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 14(%rdi)
	movb	$0, 30(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* 32 bytes left: 31 data bytes (16, then 16 at offset 15).  */
L(Exit32):
	movdqu	(%rsi), %xmm0
	movdqu	15(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 15(%rdi)
	movb	$0, 31(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
/* ExitStringTailTable targets, selected by the index of the zero byte.
   StringTailN copies N + 1 bytes from (%rsi) to (%rdi), the last of which
   is the zero byte itself, and returns.  Overlapping moves keep every
   length down to at most two loads and two stores.  Scratch: %rcx, %rdx,
   %xmm0.  */
L(StringTail0):
	movb	(%rsi), %dl
	movb	%dl, (%rdi)
	RETURN

	.p2align 4
/* Index 1: 2 bytes.  */
L(StringTail1):
	movw	(%rsi), %dx
	movw	%dx, (%rdi)
	RETURN

	.p2align 4
/* Index 2: 2 + 1 bytes.  */
L(StringTail2):
	movw	(%rsi), %cx
	movb	2(%rsi), %dl
	movw	%cx, (%rdi)
	movb	%dl, 2(%rdi)
	RETURN

	.p2align 4
/* Index 3: 4 bytes.  */
L(StringTail3):
	movl	(%rsi), %edx
	movl	%edx, (%rdi)
	RETURN

	.p2align 4
/* Index 4: 4 + 1 bytes.  */
L(StringTail4):
	movl	(%rsi), %ecx
	movb	4(%rsi), %dl
	movl	%ecx, (%rdi)
	movb	%dl, 4(%rdi)
	RETURN

	.p2align 4
/* Index 5: 4 + 2 bytes.  */
L(StringTail5):
	movl	(%rsi), %ecx
	movw	4(%rsi), %dx
	movl	%ecx, (%rdi)
	movw	%dx, 4(%rdi)
	RETURN

	.p2align 4
/* Index 6: 7 bytes (4, then 4 at offset 3).  */
L(StringTail6):
	movl	(%rsi), %ecx
	movl	3(%rsi), %edx
	movl	%ecx, (%rdi)
	movl	%edx, 3(%rdi)
	RETURN

	.p2align 4
/* Index 7: 8 bytes.  */
L(StringTail7):
	movq	(%rsi), %rdx
	movq	%rdx, (%rdi)
	RETURN

	.p2align 4
/* Index 8: 8 + 1 bytes.  */
L(StringTail8):
	movq	(%rsi), %rcx
	movb	8(%rsi), %dl
	movq	%rcx, (%rdi)
	movb	%dl, 8(%rdi)
	RETURN

	.p2align 4
/* Index 9: 8 + 2 bytes.  */
L(StringTail9):
	movq	(%rsi), %rcx
	movw	8(%rsi), %dx
	movq	%rcx, (%rdi)
	movw	%dx, 8(%rdi)
	RETURN

	.p2align 4
/* Index 10: 11 bytes (8, then 4 at offset 7).  */
L(StringTail10):
	movq	(%rsi), %rcx
	movl	7(%rsi), %edx
	movq	%rcx, (%rdi)
	movl	%edx, 7(%rdi)
	RETURN

	.p2align 4
/* Index 11: 8 + 4 bytes.  */
L(StringTail11):
	movq	(%rsi), %rcx
	movl	8(%rsi), %edx
	movq	%rcx, (%rdi)
	movl	%edx, 8(%rdi)
	RETURN

	.p2align 4
/* Index 12: 13 bytes (8, then 8 at offset 5).  */
L(StringTail12):
	movq	(%rsi), %rcx
	movq	5(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 5(%rdi)
	RETURN

	.p2align 4
/* Index 13: 14 bytes (8, then 8 at offset 6).  */
L(StringTail13):
	movq	(%rsi), %rcx
	movq	6(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 6(%rdi)
	RETURN

	.p2align 4
/* Index 14: 15 bytes (8, then 8 at offset 7).  */
L(StringTail14):
	movq	(%rsi), %rcx
	movq	7(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 7(%rdi)
	RETURN

	.p2align 4
/* Index 15: 16 bytes.  */
L(StringTail15):
	movdqu	(%rsi), %xmm0
	movdqu	%xmm0, (%rdi)
	RETURN

	.p2align 4
/* ExitStringTailTable targets for a zero byte at index 16..32.
   StringTailN copies N + 1 bytes from (%rsi) to (%rdi), the last of which
   is the zero byte itself, and returns.  The first 16 bytes always travel
   through %xmm0.  Scratch: %rcx, %rdx, %xmm0, %xmm2.  */
L(StringTail16):
	movdqu	(%rsi), %xmm0
	movb	16(%rsi), %cl
	movdqu	%xmm0, (%rdi)
	movb	%cl, 16(%rdi)
	RETURN

	.p2align 4
/* Index 17: 16 + 2 bytes.  */
L(StringTail17):
	movdqu	(%rsi), %xmm0
	movw	16(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	movw	%cx, 16(%rdi)
	RETURN

	.p2align 4
/* Index 18: 19 bytes (16, then 4 at offset 15).  */
L(StringTail18):
	movdqu	(%rsi), %xmm0
	movl	15(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 15(%rdi)
	RETURN

	.p2align 4
/* Index 19: 16 + 4 bytes.  */
L(StringTail19):
	movdqu	(%rsi), %xmm0
	movl	16(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 16(%rdi)
	RETURN

	.p2align 4
/* Index 20: 16 + 4 + 1 bytes.  */
L(StringTail20):
	movdqu	(%rsi), %xmm0
	movl	16(%rsi), %ecx
	movb	20(%rsi), %dl
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 16(%rdi)
	movb	%dl, 20(%rdi)
	RETURN

	.p2align 4
/* Index 21: 22 bytes (16, then 8 at offset 14).  */
L(StringTail21):
	movdqu	(%rsi), %xmm0
	movq	14(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 14(%rdi)
	RETURN

	.p2align 4
/* Index 22: 23 bytes (16, then 8 at offset 15).  */
L(StringTail22):
	movdqu	(%rsi), %xmm0
	movq	15(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 15(%rdi)
	RETURN

	.p2align 4
/* Index 23: 16 + 8 bytes.  */
L(StringTail23):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 16(%rdi)
	RETURN

	.p2align 4
/* Index 24: 16 + 8 + 1 bytes.  */
L(StringTail24):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movb	24(%rsi), %cl
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movb	%cl, 24(%rdi)
	RETURN

	.p2align 4
/* Index 25: 16 + 8 + 2 bytes.  */
L(StringTail25):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movw	24(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movw	%cx, 24(%rdi)
	RETURN

	.p2align 4
/* Index 26: 27 bytes (16 + 8, then 4 at offset 23).  */
L(StringTail26):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movl	23(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movl	%ecx, 23(%rdi)
	RETURN

	.p2align 4
/* Index 27: 16 + 8 + 4 bytes.  */
L(StringTail27):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movl	24(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movl	%ecx, 24(%rdi)
	RETURN

	.p2align 4
/* Index 28: 29 bytes (16, then 16 at offset 13).  */
L(StringTail28):
	movdqu	(%rsi), %xmm0
	movdqu	13(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 13(%rdi)
	RETURN

	.p2align 4
/* Index 29: 30 bytes (16, then 16 at offset 14).  */
L(StringTail29):
	movdqu	(%rsi), %xmm0
	movdqu	14(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 14(%rdi)
	RETURN

	.p2align 4
/* Index 30: 31 bytes (16, then 16 at offset 15).  */
L(StringTail30):
	movdqu	(%rsi), %xmm0
	movdqu	15(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 15(%rdi)
	RETURN

	.p2align 4
/* Index 31: 16 + 16 bytes.  */
L(StringTail31):
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 16(%rdi)
	RETURN

	.p2align 4
/* Index 32: 16 + 16 + 1 bytes.  */
L(StringTail32):
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm2
	movb	32(%rsi), %cl
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 16(%rdi)
	movb	%cl, 32(%rdi)
	RETURN

	.p2align 4
/* ExitStringTailTable target for a zero byte at index 33: copy 34 bytes
   (16 + 16 + 2) from (%rsi) to (%rdi), the last of which is the zero byte,
   and return.  The tail move is two bytes wide so that the byte at offset
   33 is written; a one-byte move here would stop at offset 32 and leave
   the destination unterminated.
   NOTE(review): none of the dispatch sites visible in this file appears
   able to produce index 33; the entry is kept so the table stays complete.
   Scratch: %rcx, %xmm0, %xmm2.  */
L(StringTail33):
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm2
	movw	32(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 16(%rdi)
	movw	%cx, 32(%rdi)
	RETURN

	.p2align 4
/* Final stage: add the length of the part of the source that was not
   copied and return.

   In:   %rsi  source pointer as left by the copy code
         %r8   bytes of the source already accounted for (skipped here)
         %rax, %r9  partial counts (RETURN yields their sum)
   Out:  %rax + %r9 + number of bytes from %rsi + %r8 up to the zero byte
   Scratch: %rcx, %rdx, %xmm0, %xmm1.

   CalculateSrcLenCase1 is the entry used when nothing is to be skipped
   and %rax must not contribute.

   Every load is a 16-byte aligned load, so no access can straddle a page
   boundary: the first block is read from the rounded-down address and the
   mask bits of the bytes in front of the real start are shifted out.  An
   unaligned probe at the start could extend up to 15 bytes past the zero
   byte and fault if those bytes lie in an unmapped page.  */
L(CalculateSrcLenCase1):
	xor	%r8, %r8
	xor	%rax, %rax
L(CalculateSrcLen):
	pxor	%xmm0, %xmm0
	add	%r8, %rsi
	mov	%rsi, %rcx
	and	$15, %rcx		/* %rcx = misalignment of the start */
	and	$-16, %rsi
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %rdx
	shr	%cl, %rdx		/* bit 0 now describes the first real byte */
	test	%rdx, %rdx
	jnz	L(SrcLenFirstBlockEnd)

	/* The compare may have matched zero bytes in front of the start, so
	   %xmm0 has to be cleared again before the loop relies on it.  */
	pxor	%xmm0, %xmm0
	add	%rax, %r9
	mov	$16, %rax
L(SrcLenLoop):
	movdqa	(%rsi, %rax), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %rdx
	test	%rdx, %rdx
	jnz	L(SrcLenLoopEnd)
	add	$16, %rax
	jmp	L(SrcLenLoop)

	.p2align 4
	/* Zero byte in the first block: the shifted mask already counts from
	   the real start, so no alignment correction is needed.  */
L(SrcLenFirstBlockEnd):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	RETURN

	.p2align 4
	/* Zero byte in a later block: %rax is the block offset from the
	   aligned base, so the misalignment in %rcx is subtracted.  */
L(SrcLenLoopEnd):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	sub	%rcx, %rax
	RETURN

END (STRLCPY)

	.section .rodata
	/* The alignment directive has to follow the section switch; issued
	   before it, it would pad .text and leave the tables unaligned.  */
	.p2align 4
/* Jump tables used by BRANCH_TO_JMPTBL_ENTRY.  Each entry is the 32-bit
   distance from the start of its table to the target label.

   ExitTable is indexed by the bytes left in the destination (0..32).  */
L(ExitTable):
	.int	JMPTBL(L(Exit0), L(ExitTable))
	.int	JMPTBL(L(Exit1), L(ExitTable))
	.int	JMPTBL(L(Exit2), L(ExitTable))
	.int	JMPTBL(L(Exit3), L(ExitTable))
	.int	JMPTBL(L(Exit4), L(ExitTable))
	.int	JMPTBL(L(Exit5), L(ExitTable))
	.int	JMPTBL(L(Exit6), L(ExitTable))
	.int	JMPTBL(L(Exit7), L(ExitTable))
	.int	JMPTBL(L(Exit8), L(ExitTable))
	.int	JMPTBL(L(Exit9), L(ExitTable))
	.int	JMPTBL(L(Exit10), L(ExitTable))
	.int	JMPTBL(L(Exit11), L(ExitTable))
	.int	JMPTBL(L(Exit12), L(ExitTable))
	.int	JMPTBL(L(Exit13), L(ExitTable))
	.int	JMPTBL(L(Exit14), L(ExitTable))
	.int	JMPTBL(L(Exit15), L(ExitTable))
	.int	JMPTBL(L(Exit16), L(ExitTable))
	.int	JMPTBL(L(Exit17), L(ExitTable))
	.int	JMPTBL(L(Exit18), L(ExitTable))
	.int	JMPTBL(L(Exit19), L(ExitTable))
	.int	JMPTBL(L(Exit20), L(ExitTable))
	.int	JMPTBL(L(Exit21), L(ExitTable))
	.int	JMPTBL(L(Exit22), L(ExitTable))
	.int	JMPTBL(L(Exit23), L(ExitTable))
	.int	JMPTBL(L(Exit24), L(ExitTable))
	.int	JMPTBL(L(Exit25), L(ExitTable))
	.int	JMPTBL(L(Exit26), L(ExitTable))
	.int	JMPTBL(L(Exit27), L(ExitTable))
	.int	JMPTBL(L(Exit28), L(ExitTable))
	.int	JMPTBL(L(Exit29), L(ExitTable))
	.int	JMPTBL(L(Exit30), L(ExitTable))
	.int	JMPTBL(L(Exit31), L(ExitTable))
	.int	JMPTBL(L(Exit32), L(ExitTable))
/* ExitStringTailTable is indexed by the position of the zero byte (0..33).  */
L(ExitStringTailTable):
	.int	JMPTBL(L(StringTail0), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail1), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail2), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail3), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail4), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail5), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail6), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail7), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail8), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail9), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail10), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail11), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail12), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail13), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail14), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail15), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail16), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail17), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail18), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail19), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail20), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail21), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail22), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail23), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail24), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail25), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail26), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail27), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail28), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail29), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail30), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail31), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail32), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail33), L(ExitStringTailTable))