default	rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD
section	.text code align=64



ALIGN	32
_aesni_ctr32_ghash_6x:
	vmovdqu	xmm2,XMMWORD[32+r11]
	sub	rdx,6
	vpxor	xmm4,xmm4,xmm4
	vmovdqu	xmm15,XMMWORD[((0-128))+rcx]
	vpaddb	xmm10,xmm1,xmm2
	vpaddb	xmm11,xmm10,xmm2
	vpaddb	xmm12,xmm11,xmm2
	vpaddb	xmm13,xmm12,xmm2
	vpaddb	xmm14,xmm13,xmm2
	vpxor	xmm9,xmm1,xmm15
	vmovdqu	XMMWORD[(16+8)+rsp],xmm4
	jmp	NEAR $L$oop6x

ALIGN	32
$L$oop6x:
	add	ebx,100663296
	jc	NEAR $L$handle_ctr32
	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
	vpaddb	xmm1,xmm14,xmm2
	vpxor	xmm10,xmm10,xmm15
	vpxor	xmm11,xmm11,xmm15

$L$resume_ctr32:
	vmovdqu	XMMWORD[r8],xmm1
	vpclmulqdq	xmm5,xmm7,xmm3,0x10
	vpxor	xmm12,xmm12,xmm15
	vmovups	xmm2,XMMWORD[((16-128))+rcx]
	vpclmulqdq	xmm6,xmm7,xmm3,0x01

















	xor	r12,r12
	cmp	r15,r14

	vaesenc	xmm9,xmm9,xmm2
	vmovdqu	xmm0,XMMWORD[((48+8))+rsp]
	vpxor	xmm13,xmm13,xmm15
	vpclmulqdq	xmm1,xmm7,xmm3,0x00
	vaesenc	xmm10,xmm10,xmm2
	vpxor	xmm14,xmm14,xmm15
	setnc	r12b
	vpclmulqdq	xmm7,xmm7,xmm3,0x11
	vaesenc	xmm11,xmm11,xmm2
	vmovdqu	xmm3,XMMWORD[((16-32))+r9]
	neg	r12
	vaesenc	xmm12,xmm12,xmm2
	vpxor	xmm6,xmm6,xmm5
	vpclmulqdq	xmm5,xmm0,xmm3,0x00
	vpxor	xmm8,xmm8,xmm4
	vaesenc	xmm13,xmm13,xmm2
	vpxor	xmm4,xmm1,xmm5
	and	r12,0x60
	vmovups	xmm15,XMMWORD[((32-128))+rcx]
	vpclmulqdq	xmm1,xmm0,xmm3,0x10
	vaesenc	xmm14,xmm14,xmm2

	vpclmulqdq	xmm2,xmm0,xmm3,0x01
	lea	r14,[r12*1+r14]
	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm8,xmm8,XMMWORD[((16+8))+rsp]
	vpclmulqdq	xmm3,xmm0,xmm3,0x11
	vmovdqu	xmm0,XMMWORD[((64+8))+rsp]
	vaesenc	xmm10,xmm10,xmm15
	movbe	r13,QWORD[88+r14]
	vaesenc	xmm11,xmm11,xmm15
	movbe	r12,QWORD[80+r14]
	vaesenc	xmm12,xmm12,xmm15
	mov	QWORD[((32+8))+rsp],r13
	vaesenc	xmm13,xmm13,xmm15
	mov	QWORD[((40+8))+rsp],r12
	vmovdqu	xmm5,XMMWORD[((48-32))+r9]
	vaesenc	xmm14,xmm14,xmm15

	vmovups	xmm15,XMMWORD[((48-128))+rcx]
	vpxor	xmm6,xmm6,xmm1
	vpclmulqdq	xmm1,xmm0,xmm5,0x00
	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm6,xmm6,xmm2
	vpclmulqdq	xmm2,xmm0,xmm5,0x10
	vaesenc	xmm10,xmm10,xmm15
	vpxor	xmm7,xmm7,xmm3
	vpclmulqdq	xmm3,xmm0,xmm5,0x01
	vaesenc	xmm11,xmm11,xmm15
	vpclmulqdq	xmm5,xmm0,xmm5,0x11
	vmovdqu	xmm0,XMMWORD[((80+8))+rsp]
	vaesenc	xmm12,xmm12,xmm15
	vaesenc	xmm13,xmm13,xmm15
	vpxor	xmm4,xmm4,xmm1
	vmovdqu	xmm1,XMMWORD[((64-32))+r9]
	vaesenc	xmm14,xmm14,xmm15

	vmovups	xmm15,XMMWORD[((64-128))+rcx]
	vpxor	xmm6,xmm6,xmm2
	vpclmulqdq	xmm2,xmm0,xmm1,0x00
	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm6,xmm6,xmm3
	vpclmulqdq	xmm3,xmm0,xmm1,0x10
	vaesenc	xmm10,xmm10,xmm15
	movbe	r13,QWORD[72+r14]
	vpxor	xmm7,xmm7,xmm5
	vpclmulqdq	xmm5,xmm0,xmm1,0x01
	vaesenc	xmm11,xmm11,xmm15
	movbe	r12,QWORD[64+r14]
	vpclmulqdq	xmm1,xmm0,xmm1,0x11
	vmovdqu	xmm0,XMMWORD[((96+8))+rsp]
	vaesenc	xmm12,xmm12,xmm15
	mov	QWORD[((48+8))+rsp],r13
	vaesenc	xmm13,xmm13,xmm15
	mov	QWORD[((56+8))+rsp],r12
	vpxor	xmm4,xmm4,xmm2
	vmovdqu	xmm2,XMMWORD[((96-32))+r9]
	vaesenc	xmm14,xmm14,xmm15

	vmovups	xmm15,XMMWORD[((80-128))+rcx]
	vpxor	xmm6,xmm6,xmm3
	vpclmulqdq	xmm3,xmm0,xmm2,0x00
	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm6,xmm6,xmm5
	vpclmulqdq	xmm5,xmm0,xmm2,0x10
	vaesenc	xmm10,xmm10,xmm15
	movbe	r13,QWORD[56+r14]
	vpxor	xmm7,xmm7,xmm1
	vpclmulqdq	xmm1,xmm0,xmm2,0x01
	vpxor	xmm8,xmm8,XMMWORD[((112+8))+rsp]
	vaesenc	xmm11,xmm11,xmm15
	movbe	r12,QWORD[48+r14]
	vpclmulqdq	xmm2,xmm0,xmm2,0x11
	vaesenc	xmm12,xmm12,xmm15
	mov	QWORD[((64+8))+rsp],r13
	vaesenc	xmm13,xmm13,xmm15
	mov	QWORD[((72+8))+rsp],r12
	vpxor	xmm4,xmm4,xmm3
	vmovdqu	xmm3,XMMWORD[((112-32))+r9]
	vaesenc	xmm14,xmm14,xmm15

	vmovups	xmm15,XMMWORD[((96-128))+rcx]
	vpxor	xmm6,xmm6,xmm5
	vpclmulqdq	xmm5,xmm8,xmm3,0x10
	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm6,xmm6,xmm1
	vpclmulqdq	xmm1,xmm8,xmm3,0x01
	vaesenc	xmm10,xmm10,xmm15
	movbe	r13,QWORD[40+r14]
	vpxor	xmm7,xmm7,xmm2
	vpclmulqdq	xmm2,xmm8,xmm3,0x00
	vaesenc	xmm11,xmm11,xmm15
	movbe	r12,QWORD[32+r14]
	vpclmulqdq	xmm8,xmm8,xmm3,0x11
	vaesenc	xmm12,xmm12,xmm15
	mov	QWORD[((80+8))+rsp],r13
	vaesenc	xmm13,xmm13,xmm15
	mov	QWORD[((88+8))+rsp],r12
	vpxor	xmm6,xmm6,xmm5
	vaesenc	xmm14,xmm14,xmm15
	vpxor	xmm6,xmm6,xmm1

	vmovups	xmm15,XMMWORD[((112-128))+rcx]
	vpslldq	xmm5,xmm6,8
	vpxor	xmm4,xmm4,xmm2
	vmovdqu	xmm3,XMMWORD[16+r11]

	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm7,xmm7,xmm8
	vaesenc	xmm10,xmm10,xmm15
	vpxor	xmm4,xmm4,xmm5
	movbe	r13,QWORD[24+r14]
	vaesenc	xmm11,xmm11,xmm15
	movbe	r12,QWORD[16+r14]
	vpalignr	xmm0,xmm4,xmm4,8
	vpclmulqdq	xmm4,xmm4,xmm3,0x10
	mov	QWORD[((96+8))+rsp],r13
	vaesenc	xmm12,xmm12,xmm15
	mov	QWORD[((104+8))+rsp],r12
	vaesenc	xmm13,xmm13,xmm15
	vmovups	xmm1,XMMWORD[((128-128))+rcx]
	vaesenc	xmm14,xmm14,xmm15

	vaesenc	xmm9,xmm9,xmm1
	vmovups	xmm15,XMMWORD[((144-128))+rcx]
	vaesenc	xmm10,xmm10,xmm1
	vpsrldq	xmm6,xmm6,8
	vaesenc	xmm11,xmm11,xmm1
	vpxor	xmm7,xmm7,xmm6
	vaesenc	xmm12,xmm12,xmm1
	vpxor	xmm4,xmm4,xmm0
	movbe	r13,QWORD[8+r14]
	vaesenc	xmm13,xmm13,xmm1
	movbe	r12,QWORD[r14]
	vaesenc	xmm14,xmm14,xmm1
	vmovups	xmm1,XMMWORD[((160-128))+rcx]
	cmp	ebp,11
	jb	NEAR $L$enc_tail

	vaesenc	xmm9,xmm9,xmm15
	vaesenc	xmm10,xmm10,xmm15
	vaesenc	xmm11,xmm11,xmm15
	vaesenc	xmm12,xmm12,xmm15
	vaesenc	xmm13,xmm13,xmm15
	vaesenc	xmm14,xmm14,xmm15

	vaesenc	xmm9,xmm9,xmm1
	vaesenc	xmm10,xmm10,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1
	vaesenc	xmm13,xmm13,xmm1
	vmovups	xmm15,XMMWORD[((176-128))+rcx]
	vaesenc	xmm14,xmm14,xmm1
	vmovups	xmm1,XMMWORD[((192-128))+rcx]
	je	NEAR $L$enc_tail

	vaesenc	xmm9,xmm9,xmm15
	vaesenc	xmm10,xmm10,xmm15
	vaesenc	xmm11,xmm11,xmm15
	vaesenc	xmm12,xmm12,xmm15
	vaesenc	xmm13,xmm13,xmm15
	vaesenc	xmm14,xmm14,xmm15

	vaesenc	xmm9,xmm9,xmm1
	vaesenc	xmm10,xmm10,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1
	vaesenc	xmm13,xmm13,xmm1
	vmovups	xmm15,XMMWORD[((208-128))+rcx]
	vaesenc	xmm14,xmm14,xmm1
	vmovups	xmm1,XMMWORD[((224-128))+rcx]
	jmp	NEAR $L$enc_tail

ALIGN	32
$L$handle_ctr32:
	vmovdqu	xmm0,XMMWORD[r11]
	vpshufb	xmm6,xmm1,xmm0
	vmovdqu	xmm5,XMMWORD[48+r11]
	vpaddd	xmm10,xmm6,XMMWORD[64+r11]
	vpaddd	xmm11,xmm6,xmm5
	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
	vpaddd	xmm12,xmm10,xmm5
	vpshufb	xmm10,xmm10,xmm0
	vpaddd	xmm13,xmm11,xmm5
	vpshufb	xmm11,xmm11,xmm0
	vpxor	xmm10,xmm10,xmm15
	vpaddd	xmm14,xmm12,xmm5
	vpshufb	xmm12,xmm12,xmm0
	vpxor	xmm11,xmm11,xmm15
	vpaddd	xmm1,xmm13,xmm5
	vpshufb	xmm13,xmm13,xmm0
	vpshufb	xmm14,xmm14,xmm0
	vpshufb	xmm1,xmm1,xmm0
	jmp	NEAR $L$resume_ctr32

ALIGN	32
$L$enc_tail:
	vaesenc	xmm9,xmm9,xmm15
	vmovdqu	XMMWORD[(16+8)+rsp],xmm7
	vpalignr	xmm8,xmm4,xmm4,8
	vaesenc	xmm10,xmm10,xmm15
	vpclmulqdq	xmm4,xmm4,xmm3,0x10
	vpxor	xmm2,xmm1,XMMWORD[rdi]
	vaesenc	xmm11,xmm11,xmm15
	vpxor	xmm0,xmm1,XMMWORD[16+rdi]
	vaesenc	xmm12,xmm12,xmm15
	vpxor	xmm5,xmm1,XMMWORD[32+rdi]
	vaesenc	xmm13,xmm13,xmm15
	vpxor	xmm6,xmm1,XMMWORD[48+rdi]
	vaesenc	xmm14,xmm14,xmm15
	vpxor	xmm7,xmm1,XMMWORD[64+rdi]
	vpxor	xmm3,xmm1,XMMWORD[80+rdi]
	vmovdqu	xmm1,XMMWORD[r8]

	vaesenclast	xmm9,xmm9,xmm2
	vmovdqu	xmm2,XMMWORD[32+r11]
	vaesenclast	xmm10,xmm10,xmm0
	vpaddb	xmm0,xmm1,xmm2
	mov	QWORD[((112+8))+rsp],r13
	lea	rdi,[96+rdi]
	vaesenclast	xmm11,xmm11,xmm5
	vpaddb	xmm5,xmm0,xmm2
	mov	QWORD[((120+8))+rsp],r12
	lea	rsi,[96+rsi]
	vmovdqu	xmm15,XMMWORD[((0-128))+rcx]
	vaesenclast	xmm12,xmm12,xmm6
	vpaddb	xmm6,xmm5,xmm2
	vaesenclast	xmm13,xmm13,xmm7
	vpaddb	xmm7,xmm6,xmm2
	vaesenclast	xmm14,xmm14,xmm3
	vpaddb	xmm3,xmm7,xmm2

	add	r10,0x60
	sub	rdx,0x6
	jc	NEAR $L$6x_done

	vmovups	XMMWORD[(-96)+rsi],xmm9
	vpxor	xmm9,xmm1,xmm15
	vmovups	XMMWORD[(-80)+rsi],xmm10
	vmovdqa	xmm10,xmm0
	vmovups	XMMWORD[(-64)+rsi],xmm11
	vmovdqa	xmm11,xmm5
	vmovups	XMMWORD[(-48)+rsi],xmm12
	vmovdqa	xmm12,xmm6
	vmovups	XMMWORD[(-32)+rsi],xmm13
	vmovdqa	xmm13,xmm7
	vmovups	XMMWORD[(-16)+rsi],xmm14
	vmovdqa	xmm14,xmm3
	vmovdqu	xmm7,XMMWORD[((32+8))+rsp]
	jmp	NEAR $L$oop6x

$L$6x_done:
	vpxor	xmm8,xmm8,XMMWORD[((16+8))+rsp]
	vpxor	xmm8,xmm8,xmm4

	DB	0F3h,0C3h		;repret

global	aesni_gcm_decrypt

ALIGN	32
aesni_gcm_decrypt:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aesni_gcm_decrypt:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]


	xor	r10,r10



	cmp	rdx,0x60
	jb	NEAR $L$gcm_dec_abort

	lea	rax,[rsp]
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	lea	rsp,[((-168))+rsp]
	movaps	XMMWORD[(-216)+rax],xmm6
	movaps	XMMWORD[(-200)+rax],xmm7
	movaps	XMMWORD[(-184)+rax],xmm8
	movaps	XMMWORD[(-168)+rax],xmm9
	movaps	XMMWORD[(-152)+rax],xmm10
	movaps	XMMWORD[(-136)+rax],xmm11
	movaps	XMMWORD[(-120)+rax],xmm12
	movaps	XMMWORD[(-104)+rax],xmm13
	movaps	XMMWORD[(-88)+rax],xmm14
	movaps	XMMWORD[(-72)+rax],xmm15
$L$gcm_dec_body:
	vzeroupper

	vmovdqu	xmm1,XMMWORD[r8]
	add	rsp,-128
	mov	ebx,DWORD[12+r8]
	lea	r11,[$L$bswap_mask]
	lea	r14,[((-128))+rcx]
	mov	r15,0xf80
	vmovdqu	xmm8,XMMWORD[r9]
	and	rsp,-128
	vmovdqu	xmm0,XMMWORD[r11]
	lea	rcx,[128+rcx]
	lea	r9,[((32+32))+r9]
	mov	ebp,DWORD[((240-128))+rcx]
	vpshufb	xmm8,xmm8,xmm0

	and	r14,r15
	and	r15,rsp
	sub	r15,r14
	jc	NEAR $L$dec_no_key_aliasing
	cmp	r15,768
	jnc	NEAR $L$dec_no_key_aliasing
	sub	rsp,r15
$L$dec_no_key_aliasing:

	vmovdqu	xmm7,XMMWORD[80+rdi]
	lea	r14,[rdi]
	vmovdqu	xmm4,XMMWORD[64+rdi]







	lea	r15,[((-192))+rdx*1+rdi]

	vmovdqu	xmm5,XMMWORD[48+rdi]
	shr	rdx,4
	xor	r10,r10
	vmovdqu	xmm6,XMMWORD[32+rdi]
	vpshufb	xmm7,xmm7,xmm0
	vmovdqu	xmm2,XMMWORD[16+rdi]
	vpshufb	xmm4,xmm4,xmm0
	vmovdqu	xmm3,XMMWORD[rdi]
	vpshufb	xmm5,xmm5,xmm0
	vmovdqu	XMMWORD[48+rsp],xmm4
	vpshufb	xmm6,xmm6,xmm0
	vmovdqu	XMMWORD[64+rsp],xmm5
	vpshufb	xmm2,xmm2,xmm0
	vmovdqu	XMMWORD[80+rsp],xmm6
	vpshufb	xmm3,xmm3,xmm0
	vmovdqu	XMMWORD[96+rsp],xmm2
	vmovdqu	XMMWORD[112+rsp],xmm3

	call	_aesni_ctr32_ghash_6x

	vmovups	XMMWORD[(-96)+rsi],xmm9
	vmovups	XMMWORD[(-80)+rsi],xmm10
	vmovups	XMMWORD[(-64)+rsi],xmm11
	vmovups	XMMWORD[(-48)+rsi],xmm12
	vmovups	XMMWORD[(-32)+rsi],xmm13
	vmovups	XMMWORD[(-16)+rsi],xmm14

	vpshufb	xmm8,xmm8,XMMWORD[r11]
	vmovdqu	XMMWORD[(-64)+r9],xmm8

	vzeroupper
	movaps	xmm6,XMMWORD[((-216))+rax]
	movaps	xmm7,XMMWORD[((-200))+rax]
	movaps	xmm8,XMMWORD[((-184))+rax]
	movaps	xmm9,XMMWORD[((-168))+rax]
	movaps	xmm10,XMMWORD[((-152))+rax]
	movaps	xmm11,XMMWORD[((-136))+rax]
	movaps	xmm12,XMMWORD[((-120))+rax]
	movaps	xmm13,XMMWORD[((-104))+rax]
	movaps	xmm14,XMMWORD[((-88))+rax]
	movaps	xmm15,XMMWORD[((-72))+rax]
	mov	r15,QWORD[((-48))+rax]
	mov	r14,QWORD[((-40))+rax]
	mov	r13,QWORD[((-32))+rax]
	mov	r12,QWORD[((-24))+rax]
	mov	rbp,QWORD[((-16))+rax]
	mov	rbx,QWORD[((-8))+rax]
	lea	rsp,[rax]
$L$gcm_dec_abort:
	mov	rax,r10
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret
$L$SEH_end_aesni_gcm_decrypt:

ALIGN	32
_aesni_ctr32_6x:
	vmovdqu	xmm4,XMMWORD[((0-128))+rcx]
	vmovdqu	xmm2,XMMWORD[32+r11]
	lea	r13,[((-1))+rbp]
	vmovups	xmm15,XMMWORD[((16-128))+rcx]
	lea	r12,[((32-128))+rcx]
	vpxor	xmm9,xmm1,xmm4
	add	ebx,100663296
	jc	NEAR $L$handle_ctr32_2
	vpaddb	xmm10,xmm1,xmm2
	vpaddb	xmm11,xmm10,xmm2
	vpxor	xmm10,xmm10,xmm4
	vpaddb	xmm12,xmm11,xmm2
	vpxor	xmm11,xmm11,xmm4
	vpaddb	xmm13,xmm12,xmm2
	vpxor	xmm12,xmm12,xmm4
	vpaddb	xmm14,xmm13,xmm2
	vpxor	xmm13,xmm13,xmm4
	vpaddb	xmm1,xmm14,xmm2
	vpxor	xmm14,xmm14,xmm4
	jmp	NEAR $L$oop_ctr32

ALIGN	16
$L$oop_ctr32:
	vaesenc	xmm9,xmm9,xmm15
	vaesenc	xmm10,xmm10,xmm15
	vaesenc	xmm11,xmm11,xmm15
	vaesenc	xmm12,xmm12,xmm15
	vaesenc	xmm13,xmm13,xmm15
	vaesenc	xmm14,xmm14,xmm15
	vmovups	xmm15,XMMWORD[r12]
	lea	r12,[16+r12]
	dec	r13d
	jnz	NEAR $L$oop_ctr32

	vmovdqu	xmm3,XMMWORD[r12]
	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm4,xmm3,XMMWORD[rdi]
	vaesenc	xmm10,xmm10,xmm15
	vpxor	xmm5,xmm3,XMMWORD[16+rdi]
	vaesenc	xmm11,xmm11,xmm15
	vpxor	xmm6,xmm3,XMMWORD[32+rdi]
	vaesenc	xmm12,xmm12,xmm15
	vpxor	xmm8,xmm3,XMMWORD[48+rdi]
	vaesenc	xmm13,xmm13,xmm15
	vpxor	xmm2,xmm3,XMMWORD[64+rdi]
	vaesenc	xmm14,xmm14,xmm15
	vpxor	xmm3,xmm3,XMMWORD[80+rdi]
	lea	rdi,[96+rdi]

	vaesenclast	xmm9,xmm9,xmm4
	vaesenclast	xmm10,xmm10,xmm5
	vaesenclast	xmm11,xmm11,xmm6
	vaesenclast	xmm12,xmm12,xmm8
	vaesenclast	xmm13,xmm13,xmm2
	vaesenclast	xmm14,xmm14,xmm3
	vmovups	XMMWORD[rsi],xmm9
	vmovups	XMMWORD[16+rsi],xmm10
	vmovups	XMMWORD[32+rsi],xmm11
	vmovups	XMMWORD[48+rsi],xmm12
	vmovups	XMMWORD[64+rsi],xmm13
	vmovups	XMMWORD[80+rsi],xmm14
	lea	rsi,[96+rsi]

	DB	0F3h,0C3h		;repret
ALIGN	32
$L$handle_ctr32_2:
	vpshufb	xmm6,xmm1,xmm0
	vmovdqu	xmm5,XMMWORD[48+r11]
	vpaddd	xmm10,xmm6,XMMWORD[64+r11]
	vpaddd	xmm11,xmm6,xmm5
	vpaddd	xmm12,xmm10,xmm5
	vpshufb	xmm10,xmm10,xmm0
	vpaddd	xmm13,xmm11,xmm5
	vpshufb	xmm11,xmm11,xmm0
	vpxor	xmm10,xmm10,xmm4
	vpaddd	xmm14,xmm12,xmm5
	vpshufb	xmm12,xmm12,xmm0
	vpxor	xmm11,xmm11,xmm4
	vpaddd	xmm1,xmm13,xmm5
	vpshufb	xmm13,xmm13,xmm0
	vpxor	xmm12,xmm12,xmm4
	vpshufb	xmm14,xmm14,xmm0
	vpxor	xmm13,xmm13,xmm4
	vpshufb	xmm1,xmm1,xmm0
	vpxor	xmm14,xmm14,xmm4
	jmp	NEAR $L$oop_ctr32


global	aesni_gcm_encrypt

ALIGN	32
aesni_gcm_encrypt:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aesni_gcm_encrypt:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]


	xor	r10,r10




	cmp	rdx,0x60*3
	jb	NEAR $L$gcm_enc_abort

	lea	rax,[rsp]
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	lea	rsp,[((-168))+rsp]
	movaps	XMMWORD[(-216)+rax],xmm6
	movaps	XMMWORD[(-200)+rax],xmm7
	movaps	XMMWORD[(-184)+rax],xmm8
	movaps	XMMWORD[(-168)+rax],xmm9
	movaps	XMMWORD[(-152)+rax],xmm10
	movaps	XMMWORD[(-136)+rax],xmm11
	movaps	XMMWORD[(-120)+rax],xmm12
	movaps	XMMWORD[(-104)+rax],xmm13
	movaps	XMMWORD[(-88)+rax],xmm14
	movaps	XMMWORD[(-72)+rax],xmm15
$L$gcm_enc_body:
	vzeroupper

	vmovdqu	xmm1,XMMWORD[r8]
	add	rsp,-128
	mov	ebx,DWORD[12+r8]
	lea	r11,[$L$bswap_mask]
	lea	r14,[((-128))+rcx]
	mov	r15,0xf80
	lea	rcx,[128+rcx]
	vmovdqu	xmm0,XMMWORD[r11]
	and	rsp,-128
	mov	ebp,DWORD[((240-128))+rcx]

	and	r14,r15
	and	r15,rsp
	sub	r15,r14
	jc	NEAR $L$enc_no_key_aliasing
	cmp	r15,768
	jnc	NEAR $L$enc_no_key_aliasing
	sub	rsp,r15
$L$enc_no_key_aliasing:

	lea	r14,[rsi]








	lea	r15,[((-192))+rdx*1+rsi]

	shr	rdx,4

	call	_aesni_ctr32_6x
	vpshufb	xmm8,xmm9,xmm0
	vpshufb	xmm2,xmm10,xmm0
	vmovdqu	XMMWORD[112+rsp],xmm8
	vpshufb	xmm4,xmm11,xmm0
	vmovdqu	XMMWORD[96+rsp],xmm2
	vpshufb	xmm5,xmm12,xmm0
	vmovdqu	XMMWORD[80+rsp],xmm4
	vpshufb	xmm6,xmm13,xmm0
	vmovdqu	XMMWORD[64+rsp],xmm5
	vpshufb	xmm7,xmm14,xmm0
	vmovdqu	XMMWORD[48+rsp],xmm6

	call	_aesni_ctr32_6x

	vmovdqu	xmm8,XMMWORD[r9]
	lea	r9,[((32+32))+r9]
	sub	rdx,12
	mov	r10,0x60*2
	vpshufb	xmm8,xmm8,xmm0

	call	_aesni_ctr32_ghash_6x
	vmovdqu	xmm7,XMMWORD[32+rsp]
	vmovdqu	xmm0,XMMWORD[r11]
	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
	vpunpckhqdq	xmm1,xmm7,xmm7
	vmovdqu	xmm15,XMMWORD[((32-32))+r9]
	vmovups	XMMWORD[(-96)+rsi],xmm9
	vpshufb	xmm9,xmm9,xmm0
	vpxor	xmm1,xmm1,xmm7
	vmovups	XMMWORD[(-80)+rsi],xmm10
	vpshufb	xmm10,xmm10,xmm0
	vmovups	XMMWORD[(-64)+rsi],xmm11
	vpshufb	xmm11,xmm11,xmm0
	vmovups	XMMWORD[(-48)+rsi],xmm12
	vpshufb	xmm12,xmm12,xmm0
	vmovups	XMMWORD[(-32)+rsi],xmm13
	vpshufb	xmm13,xmm13,xmm0
	vmovups	XMMWORD[(-16)+rsi],xmm14
	vpshufb	xmm14,xmm14,xmm0
	vmovdqu	XMMWORD[16+rsp],xmm9
	vmovdqu	xmm6,XMMWORD[48+rsp]
	vmovdqu	xmm0,XMMWORD[((16-32))+r9]
	vpunpckhqdq	xmm2,xmm6,xmm6
	vpclmulqdq	xmm5,xmm7,xmm3,0x00
	vpxor	xmm2,xmm2,xmm6
	vpclmulqdq	xmm7,xmm7,xmm3,0x11
	vpclmulqdq	xmm1,xmm1,xmm15,0x00

	vmovdqu	xmm9,XMMWORD[64+rsp]
	vpclmulqdq	xmm4,xmm6,xmm0,0x00
	vmovdqu	xmm3,XMMWORD[((48-32))+r9]
	vpxor	xmm4,xmm4,xmm5
	vpunpckhqdq	xmm5,xmm9,xmm9
	vpclmulqdq	xmm6,xmm6,xmm0,0x11
	vpxor	xmm5,xmm5,xmm9
	vpxor	xmm6,xmm6,xmm7
	vpclmulqdq	xmm2,xmm2,xmm15,0x10
	vmovdqu	xmm15,XMMWORD[((80-32))+r9]
	vpxor	xmm2,xmm2,xmm1

	vmovdqu	xmm1,XMMWORD[80+rsp]
	vpclmulqdq	xmm7,xmm9,xmm3,0x00
	vmovdqu	xmm0,XMMWORD[((64-32))+r9]
	vpxor	xmm7,xmm7,xmm4
	vpunpckhqdq	xmm4,xmm1,xmm1
	vpclmulqdq	xmm9,xmm9,xmm3,0x11
	vpxor	xmm4,xmm4,xmm1
	vpxor	xmm9,xmm9,xmm6
	vpclmulqdq	xmm5,xmm5,xmm15,0x00
	vpxor	xmm5,xmm5,xmm2

	vmovdqu	xmm2,XMMWORD[96+rsp]
	vpclmulqdq	xmm6,xmm1,xmm0,0x00
	vmovdqu	xmm3,XMMWORD[((96-32))+r9]
	vpxor	xmm6,xmm6,xmm7
	vpunpckhqdq	xmm7,xmm2,xmm2
	vpclmulqdq	xmm1,xmm1,xmm0,0x11
	vpxor	xmm7,xmm7,xmm2
	vpxor	xmm1,xmm1,xmm9
	vpclmulqdq	xmm4,xmm4,xmm15,0x10
	vmovdqu	xmm15,XMMWORD[((128-32))+r9]
	vpxor	xmm4,xmm4,xmm5

	vpxor	xmm8,xmm8,XMMWORD[112+rsp]
	vpclmulqdq	xmm5,xmm2,xmm3,0x00
	vmovdqu	xmm0,XMMWORD[((112-32))+r9]
	vpunpckhqdq	xmm9,xmm8,xmm8
	vpxor	xmm5,xmm5,xmm6
	vpclmulqdq	xmm2,xmm2,xmm3,0x11
	vpxor	xmm9,xmm9,xmm8
	vpxor	xmm2,xmm2,xmm1
	vpclmulqdq	xmm7,xmm7,xmm15,0x00
	vpxor	xmm4,xmm7,xmm4

	vpclmulqdq	xmm6,xmm8,xmm0,0x00
	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
	vpunpckhqdq	xmm1,xmm14,xmm14
	vpclmulqdq	xmm8,xmm8,xmm0,0x11
	vpxor	xmm1,xmm1,xmm14
	vpxor	xmm5,xmm6,xmm5
	vpclmulqdq	xmm9,xmm9,xmm15,0x10
	vmovdqu	xmm15,XMMWORD[((32-32))+r9]
	vpxor	xmm7,xmm8,xmm2
	vpxor	xmm6,xmm9,xmm4

	vmovdqu	xmm0,XMMWORD[((16-32))+r9]
	vpxor	xmm9,xmm7,xmm5
	vpclmulqdq	xmm4,xmm14,xmm3,0x00
	vpxor	xmm6,xmm6,xmm9
	vpunpckhqdq	xmm2,xmm13,xmm13
	vpclmulqdq	xmm14,xmm14,xmm3,0x11
	vpxor	xmm2,xmm2,xmm13
	vpslldq	xmm9,xmm6,8
	vpclmulqdq	xmm1,xmm1,xmm15,0x00
	vpxor	xmm8,xmm5,xmm9
	vpsrldq	xmm6,xmm6,8
	vpxor	xmm7,xmm7,xmm6

	vpclmulqdq	xmm5,xmm13,xmm0,0x00
	vmovdqu	xmm3,XMMWORD[((48-32))+r9]
	vpxor	xmm5,xmm5,xmm4
	vpunpckhqdq	xmm9,xmm12,xmm12
	vpclmulqdq	xmm13,xmm13,xmm0,0x11
	vpxor	xmm9,xmm9,xmm12
	vpxor	xmm13,xmm13,xmm14
	vpalignr	xmm14,xmm8,xmm8,8
	vpclmulqdq	xmm2,xmm2,xmm15,0x10
	vmovdqu	xmm15,XMMWORD[((80-32))+r9]
	vpxor	xmm2,xmm2,xmm1

	vpclmulqdq	xmm4,xmm12,xmm3,0x00
	vmovdqu	xmm0,XMMWORD[((64-32))+r9]
	vpxor	xmm4,xmm4,xmm5
	vpunpckhqdq	xmm1,xmm11,xmm11
	vpclmulqdq	xmm12,xmm12,xmm3,0x11
	vpxor	xmm1,xmm1,xmm11
	vpxor	xmm12,xmm12,xmm13
	vxorps	xmm7,xmm7,XMMWORD[16+rsp]
	vpclmulqdq	xmm9,xmm9,xmm15,0x00
	vpxor	xmm9,xmm9,xmm2

	vpclmulqdq	xmm8,xmm8,XMMWORD[16+r11],0x10
	vxorps	xmm8,xmm8,xmm14

	vpclmulqdq	xmm5,xmm11,xmm0,0x00
	vmovdqu	xmm3,XMMWORD[((96-32))+r9]
	vpxor	xmm5,xmm5,xmm4
	vpunpckhqdq	xmm2,xmm10,xmm10
	vpclmulqdq	xmm11,xmm11,xmm0,0x11
	vpxor	xmm2,xmm2,xmm10
	vpalignr	xmm14,xmm8,xmm8,8
	vpxor	xmm11,xmm11,xmm12
	vpclmulqdq	xmm1,xmm1,xmm15,0x10
	vmovdqu	xmm15,XMMWORD[((128-32))+r9]
	vpxor	xmm1,xmm1,xmm9

	vxorps	xmm14,xmm14,xmm7
	vpclmulqdq	xmm8,xmm8,XMMWORD[16+r11],0x10
	vxorps	xmm8,xmm8,xmm14

	vpclmulqdq	xmm4,xmm10,xmm3,0x00
	vmovdqu	xmm0,XMMWORD[((112-32))+r9]
	vpxor	xmm4,xmm4,xmm5
	vpunpckhqdq	xmm9,xmm8,xmm8
	vpclmulqdq	xmm10,xmm10,xmm3,0x11
	vpxor	xmm9,xmm9,xmm8
	vpxor	xmm10,xmm10,xmm11
	vpclmulqdq	xmm2,xmm2,xmm15,0x00
	vpxor	xmm2,xmm2,xmm1

	vpclmulqdq	xmm5,xmm8,xmm0,0x00
	vpclmulqdq	xmm7,xmm8,xmm0,0x11
	vpxor	xmm5,xmm5,xmm4
	vpclmulqdq	xmm6,xmm9,xmm15,0x10
	vpxor	xmm7,xmm7,xmm10
	vpxor	xmm6,xmm6,xmm2

	vpxor	xmm4,xmm7,xmm5
	vpxor	xmm6,xmm6,xmm4
	vpslldq	xmm1,xmm6,8
	vmovdqu	xmm3,XMMWORD[16+r11]
	vpsrldq	xmm6,xmm6,8
	vpxor	xmm8,xmm5,xmm1
	vpxor	xmm7,xmm7,xmm6

	vpalignr	xmm2,xmm8,xmm8,8
	vpclmulqdq	xmm8,xmm8,xmm3,0x10
	vpxor	xmm8,xmm8,xmm2

	vpalignr	xmm2,xmm8,xmm8,8
	vpclmulqdq	xmm8,xmm8,xmm3,0x10
	vpxor	xmm2,xmm2,xmm7
	vpxor	xmm8,xmm8,xmm2
	vpshufb	xmm8,xmm8,XMMWORD[r11]
	vmovdqu	XMMWORD[(-64)+r9],xmm8

	vzeroupper
	movaps	xmm6,XMMWORD[((-216))+rax]
	movaps	xmm7,XMMWORD[((-200))+rax]
	movaps	xmm8,XMMWORD[((-184))+rax]
	movaps	xmm9,XMMWORD[((-168))+rax]
	movaps	xmm10,XMMWORD[((-152))+rax]
	movaps	xmm11,XMMWORD[((-136))+rax]
	movaps	xmm12,XMMWORD[((-120))+rax]
	movaps	xmm13,XMMWORD[((-104))+rax]
	movaps	xmm14,XMMWORD[((-88))+rax]
	movaps	xmm15,XMMWORD[((-72))+rax]
	mov	r15,QWORD[((-48))+rax]
	mov	r14,QWORD[((-40))+rax]
	mov	r13,QWORD[((-32))+rax]
	mov	r12,QWORD[((-24))+rax]
	mov	rbp,QWORD[((-16))+rax]
	mov	rbx,QWORD[((-8))+rax]
	lea	rsp,[rax]
$L$gcm_enc_abort:
	mov	rax,r10
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret
$L$SEH_end_aesni_gcm_encrypt:
ALIGN	64
$L$bswap_mask:
DB	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
$L$poly:
DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
$L$one_msb:
DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
$L$two_lsb:
DB	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
$L$one_lsb:
DB	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
DB	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108
DB	101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82
DB	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
DB	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
ALIGN	64
EXTERN	__imp_RtlVirtualUnwind

ALIGN	16
gcm_se_handler:
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64

	mov	rax,QWORD[120+r8]
	mov	rbx,QWORD[248+r8]

	mov	rsi,QWORD[8+r9]
	mov	r11,QWORD[56+r9]

	mov	r10d,DWORD[r11]
	lea	r10,[r10*1+rsi]
	cmp	rbx,r10
	jb	NEAR $L$common_seh_tail

	mov	rax,QWORD[152+r8]

	mov	r10d,DWORD[4+r11]
	lea	r10,[r10*1+rsi]
	cmp	rbx,r10
	jae	NEAR $L$common_seh_tail

	mov	rax,QWORD[120+r8]

	mov	r15,QWORD[((-48))+rax]
	mov	r14,QWORD[((-40))+rax]
	mov	r13,QWORD[((-32))+rax]
	mov	r12,QWORD[((-24))+rax]
	mov	rbp,QWORD[((-16))+rax]
	mov	rbx,QWORD[((-8))+rax]
	mov	QWORD[240+r8],r15
	mov	QWORD[232+r8],r14
	mov	QWORD[224+r8],r13
	mov	QWORD[216+r8],r12
	mov	QWORD[160+r8],rbp
	mov	QWORD[144+r8],rbx

	lea	rsi,[((-216))+rax]
	lea	rdi,[512+r8]
	mov	ecx,20
	DD	0xa548f3fc

$L$common_seh_tail:
	mov	rdi,QWORD[8+rax]
	mov	rsi,QWORD[16+rax]
	mov	QWORD[152+r8],rax
	mov	QWORD[168+r8],rsi
	mov	QWORD[176+r8],rdi

	mov	rdi,QWORD[40+r9]
	mov	rsi,r8
	mov	ecx,154
	DD	0xa548f3fc

	mov	rsi,r9
	xor	rcx,rcx
	mov	rdx,QWORD[8+rsi]
	mov	r8,QWORD[rsi]
	mov	r9,QWORD[16+rsi]
	mov	r10,QWORD[40+rsi]
	lea	r11,[56+rsi]
	lea	r12,[24+rsi]
	mov	QWORD[32+rsp],r10
	mov	QWORD[40+rsp],r11
	mov	QWORD[48+rsp],r12
	mov	QWORD[56+rsp],rcx
	call	QWORD[__imp_RtlVirtualUnwind]

	mov	eax,1
	add	rsp,64
	popfq
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rbp
	pop	rbx
	pop	rdi
	pop	rsi
	DB	0F3h,0C3h		;repret


section	.pdata rdata align=4
ALIGN	4
	DD	$L$SEH_begin_aesni_gcm_decrypt wrt ..imagebase
	DD	$L$SEH_end_aesni_gcm_decrypt wrt ..imagebase
	DD	$L$SEH_gcm_dec_info wrt ..imagebase

	DD	$L$SEH_begin_aesni_gcm_encrypt wrt ..imagebase
	DD	$L$SEH_end_aesni_gcm_encrypt wrt ..imagebase
	DD	$L$SEH_gcm_enc_info wrt ..imagebase
section	.xdata rdata align=8
ALIGN	8
$L$SEH_gcm_dec_info:
DB	9,0,0,0
	DD	gcm_se_handler wrt ..imagebase
	DD	$L$gcm_dec_body wrt ..imagebase,$L$gcm_dec_abort wrt ..imagebase
$L$SEH_gcm_enc_info:
DB	9,0,0,0
	DD	gcm_se_handler wrt ..imagebase
	DD	$L$gcm_enc_body wrt ..imagebase,$L$gcm_enc_abort wrt ..imagebase