#if defined(__x86_64__)
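// 512-bit modular exponentiation (mod_exp_512) for x86-64, Mach-O flavor.
// The layout (flat unrolled code, .byte-encoded returns, L$-prefixed local
// labels) appears to be perlasm-style generated output, as in OpenSSL's
// modexp512-x86_64 module.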
.text


.p2align	4
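// MULADD_128x512: multiply the 512-bit operand at (%rsi) (8 qwords) by the
// 128-bit multiplier whose low qword is in %rbp and whose high qword is at
// 8(%rdi), accumulating into %r8-%r15. The two completed low qwords of the
// 640-bit result are stored at 0(%rcx) and 8(%rcx); the upper 512 bits are
// returned in %r10-%r15,%r8,%r9.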
MULADD_128x512:
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	movq	%r8,0(%rcx)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%r8
	movq	8(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	movq	%r9,8(%rcx)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%r9
	.byte	0xf3,0xc3	// two-byte "rep ret"


.p2align	4
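// mont_reduce: reduce the 1024-bit intermediate at 520(%rsp) modulo m.
// It appears to use a folding reduction: the top qwords of the input are
// multiplied by precomputed constants from the modulus context and folded
// into the low half, finishing with a branch-free conditional subtraction.
// All %rsp offsets here are 8 larger than in _mod_exp_512's frame because
// of the return address pushed by the call: 32(%rsp) is the modulus-context
// pointer saved at 24(%rsp) there, and 144(%rsp) is the destination pointer
// saved at 136(%rsp).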
mont_reduce:
	leaq	192(%rsp),%rdi
	movq	32(%rsp),%rsi
	addq	$576,%rsi
	leaq	520(%rsp),%rcx

	movq	96(%rcx),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	movq	(%rcx),%r8
	addq	%rax,%r8
	adcq	$0,%rdx
	movq	%r8,0(%rdi)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	movq	8(%rcx),%r9
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	movq	16(%rcx),%r10
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	movq	24(%rcx),%r11
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	movq	32(%rcx),%r12
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	movq	40(%rcx),%r13
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	movq	48(%rcx),%r14
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	movq	56(%rcx),%r15
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%r8
	movq	104(%rcx),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	movq	%r9,8(%rdi)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%r9
	movq	112(%rcx),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	movq	%r10,16(%rdi)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%r10
	movq	120(%rcx),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	movq	%r11,24(%rdi)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%r11
	xorq	%rax,%rax

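// add x[8..11] (qwords 8-11 of the input at 520(%rsp)) on top of the folded
// product and capture the carry in %rax (saved at 384(%rsp) below)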
	addq	64(%rcx),%r8
	adcq	72(%rcx),%r9
	adcq	80(%rcx),%r10
	adcq	88(%rcx),%r11
	adcq	$0,%rax




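// store words 8, 9 and 11 of the intermediate; word 10 stays in %rbp, where
// MULADD_128x512 expects the low multiplier qword for the second fold below
// (the high qword, word 11, is reread there as 8(%rdi))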
	movq	%r8,64(%rdi)
	movq	%r9,72(%rdi)
	movq	%r10,%rbp
	movq	%r11,88(%rdi)

	movq	%rax,384(%rsp)

	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11








	addq	$80,%rdi

	addq	$64,%rsi
	leaq	296(%rsp),%rcx

	call	MULADD_128x512

	movq	384(%rsp),%rax


	addq	-16(%rdi),%r8
	adcq	-8(%rdi),%r9
	movq	%r8,64(%rcx)
	movq	%r9,72(%rcx)

	adcq	%rax,%rax
	movq	%rax,384(%rsp)

	leaq	192(%rsp),%rdi
	addq	$64,%rsi





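// 128x128 -> 128-bit partial product: only the low two qwords are computed;
// the low qword stays in %rbp and the high one is stored at 8(%rdi),
// matching MULADD_128x512's multiplier convention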
	movq	(%rsi),%r8
	movq	8(%rsi),%rbx

	movq	(%rcx),%rax
	mulq	%r8
	movq	%rax,%rbp
	movq	%rdx,%r9

	movq	8(%rcx),%rax
	mulq	%r8
	addq	%rax,%r9

	movq	(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r9

	movq	%r9,8(%rdi)


	subq	$192,%rsi

	movq	(%rcx),%r8
	movq	8(%rcx),%r9

	call	MULADD_128x512




	movq	0(%rsi),%rax
	movq	8(%rsi),%rbx
	movq	16(%rsi),%rdi
	movq	24(%rsi),%rdx


	movq	384(%rsp),%rbp

	addq	64(%rcx),%r8
	adcq	72(%rcx),%r9


	adcq	%rbp,%rbp



	shlq	$3,%rbp
	movq	32(%rsp),%rcx
	addq	%rcx,%rbp


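// branch-free final correction: add the carry-selected row of precomputed
// values, turn any carry-out into an all-ones mask in %rsi, then subtract m
// (loaded into %rax,%rbx,%rdi,%rdx above and below) under that mask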
	xorq	%rsi,%rsi

	addq	0(%rbp),%r10
	adcq	64(%rbp),%r11
	adcq	128(%rbp),%r12
	adcq	192(%rbp),%r13
	adcq	256(%rbp),%r14
	adcq	320(%rbp),%r15
	adcq	384(%rbp),%r8
	adcq	448(%rbp),%r9



	sbbq	$0,%rsi


	andq	%rsi,%rax
	andq	%rsi,%rbx
	andq	%rsi,%rdi
	andq	%rsi,%rdx

	movq	$1,%rbp
	subq	%rax,%r10
	sbbq	%rbx,%r11
	sbbq	%rdi,%r12
	sbbq	%rdx,%r13




	sbbq	$0,%rbp



	addq	$512,%rcx
	movq	32(%rcx),%rax
	movq	40(%rcx),%rbx
	movq	48(%rcx),%rdi
	movq	56(%rcx),%rdx



	andq	%rsi,%rax
	andq	%rsi,%rbx
	andq	%rsi,%rdi
	andq	%rsi,%rdx



	subq	$1,%rbp

	sbbq	%rax,%r14
	sbbq	%rbx,%r15
	sbbq	%rdi,%r8
	sbbq	%rdx,%r9



	movq	144(%rsp),%rsi
	movq	%r10,0(%rsi)
	movq	%r11,8(%rsi)
	movq	%r12,16(%rsi)
	movq	%r13,24(%rsi)
	movq	%r14,32(%rsi)
	movq	%r15,40(%rsi)
	movq	%r8,48(%rsi)
	movq	%r9,56(%rsi)

	.byte	0xf3,0xc3	// two-byte "rep ret"


.p2align	4
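// mont_mul_a3b: schoolbook 512x512 -> 1024-bit multiply of the operand at
// (%rdi) by the one at (%rsi), whose qwords the caller preloads into
// %r10-%r15,%r8,%r9; the product is written at 520(%rsp) and control then
// jumps into mont_reduce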
mont_mul_a3b:




	movq	0(%rdi),%rbp

	movq	%r10,%rax
	mulq	%rbp
	movq	%rax,520(%rsp)
	movq	%rdx,%r10
	movq	%r11,%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	movq	%rdx,%r11
	movq	%r12,%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	movq	%rdx,%r12
	movq	%r13,%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	movq	%rdx,%r13
	movq	%r14,%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	movq	%rdx,%r14
	movq	%r15,%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	movq	%rdx,%r15
	movq	%r8,%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	movq	%rdx,%r8
	movq	%r9,%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	movq	%rdx,%r9
	movq	8(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	movq	%r10,528(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%r10
	movq	16(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	movq	%r11,536(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%r11
	movq	24(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	movq	%r12,544(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%r12
	movq	32(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	movq	%r13,552(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%r13
	movq	40(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	movq	%r14,560(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%r14
	movq	48(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	movq	%r15,568(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%r15
	movq	56(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	movq	%r8,576(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%r8
	movq	%r9,584(%rsp)
	movq	%r10,592(%rsp)
	movq	%r11,600(%rsp)
	movq	%r12,608(%rsp)
	movq	%r13,616(%rsp)
	movq	%r14,624(%rsp)
	movq	%r15,632(%rsp)
	movq	%r8,640(%rsp)





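// full 1024-bit product now at 520(%rsp)..640(%rsp); reduce it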
	jmp	mont_reduce




.p2align	4
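// sqr_reduce: square the 512-bit value at 16(%rsp) (qwords preloaded in
// %r10-%r15,%r8,%r9): accumulate the off-diagonal products once, double
// them, add the diagonal squares x[i]^2, then jump into mont_reduce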
sqr_reduce:
	movq	16(%rsp),%rcx



	movq	%r10,%rbx

	movq	%r11,%rax
	mulq	%rbx
	movq	%rax,528(%rsp)
	movq	%rdx,%r10
	movq	%r12,%rax
	mulq	%rbx
	addq	%rax,%r10
	adcq	$0,%rdx
	movq	%rdx,%r11
	movq	%r13,%rax
	mulq	%rbx
	addq	%rax,%r11
	adcq	$0,%rdx
	movq	%rdx,%r12
	movq	%r14,%rax
	mulq	%rbx
	addq	%rax,%r12
	adcq	$0,%rdx
	movq	%rdx,%r13
	movq	%r15,%rax
	mulq	%rbx
	addq	%rax,%r13
	adcq	$0,%rdx
	movq	%rdx,%r14
	movq	%r8,%rax
	mulq	%rbx
	addq	%rax,%r14
	adcq	$0,%rdx
	movq	%rdx,%r15
	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	movq	%rdx,%rsi

	movq	%r10,536(%rsp)





	movq	8(%rcx),%rbx

	movq	16(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r11
	adcq	$0,%rdx
	movq	%r11,544(%rsp)

	movq	%rdx,%r10
	movq	24(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%r10,%r12
	adcq	$0,%rdx
	movq	%r12,552(%rsp)

	movq	%rdx,%r10
	movq	32(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx

	movq	%rdx,%r10
	movq	40(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%r10,%r14
	adcq	$0,%rdx

	movq	%rdx,%r10
	movq	%r8,%rax
	mulq	%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r10,%r15
	adcq	$0,%rdx

	movq	%rdx,%r10
	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%rsi
	adcq	$0,%rdx
	addq	%r10,%rsi
	adcq	$0,%rdx

	movq	%rdx,%r11




	movq	16(%rcx),%rbx

	movq	24(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r13
	adcq	$0,%rdx
	movq	%r13,560(%rsp)

	movq	%rdx,%r10
	movq	32(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%r10,%r14
	adcq	$0,%rdx
	movq	%r14,568(%rsp)

	movq	%rdx,%r10
	movq	40(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r10,%r15
	adcq	$0,%rdx

	movq	%rdx,%r10
	movq	%r8,%rax
	mulq	%rbx
	addq	%rax,%rsi
	adcq	$0,%rdx
	addq	%r10,%rsi
	adcq	$0,%rdx

	movq	%rdx,%r10
	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%r10,%r11
	adcq	$0,%rdx

	movq	%rdx,%r12





	movq	24(%rcx),%rbx

	movq	32(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	movq	%r15,576(%rsp)

	movq	%rdx,%r10
	movq	40(%rcx),%rax
	mulq	%rbx
	addq	%rax,%rsi
	adcq	$0,%rdx
	addq	%r10,%rsi
	adcq	$0,%rdx
	movq	%rsi,584(%rsp)

	movq	%rdx,%r10
	movq	%r8,%rax
	mulq	%rbx
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%r10,%r11
	adcq	$0,%rdx

	movq	%rdx,%r10
	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%r10,%r12
	adcq	$0,%rdx

	movq	%rdx,%r15




	movq	32(%rcx),%rbx

	movq	40(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r11
	adcq	$0,%rdx
	movq	%r11,592(%rsp)

	movq	%rdx,%r10
	movq	%r8,%rax
	mulq	%rbx
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%r10,%r12
	adcq	$0,%rdx
	movq	%r12,600(%rsp)

	movq	%rdx,%r10
	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r10,%r15
	adcq	$0,%rdx

	movq	%rdx,%r11




	movq	40(%rcx),%rbx

	movq	%r8,%rax
	mulq	%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	movq	%r15,608(%rsp)

	movq	%rdx,%r10
	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%r10,%r11
	adcq	$0,%rdx
	movq	%r11,616(%rsp)

	movq	%rdx,%r12




	movq	%r8,%rbx

	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%r12
	adcq	$0,%rdx
	movq	%r12,624(%rsp)

	movq	%rdx,632(%rsp)


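// double the low cross products and fold in the squares x[0]^2..x[3]^2;
// x[3]^2 is computed early and parked in %rdi (low) and %r8 (high)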
	movq	528(%rsp),%r10
	movq	536(%rsp),%r11
	movq	544(%rsp),%r12
	movq	552(%rsp),%r13
	movq	560(%rsp),%r14
	movq	568(%rsp),%r15

	movq	24(%rcx),%rax
	mulq	%rax
	movq	%rax,%rdi
	movq	%rdx,%r8

	addq	%r10,%r10
	adcq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14
	adcq	%r15,%r15
	adcq	$0,%r8

	movq	0(%rcx),%rax
	mulq	%rax
	movq	%rax,520(%rsp)
	movq	%rdx,%rbx

	movq	8(%rcx),%rax
	mulq	%rax

	addq	%rbx,%r10
	adcq	%rax,%r11
	adcq	$0,%rdx

	movq	%rdx,%rbx
	movq	%r10,528(%rsp)
	movq	%r11,536(%rsp)

	movq	16(%rcx),%rax
	mulq	%rax

	addq	%rbx,%r12
	adcq	%rax,%r13
	adcq	$0,%rdx

	movq	%rdx,%rbx

	movq	%r12,544(%rsp)
	movq	%r13,552(%rsp)

	xorq	%rbp,%rbp
	addq	%rbx,%r14
	adcq	%rdi,%r15
	adcq	$0,%rbp

	movq	%r14,560(%rsp)
	movq	%r15,568(%rsp)




	movq	576(%rsp),%r10
	movq	584(%rsp),%r11
	movq	592(%rsp),%r12
	movq	600(%rsp),%r13
	movq	608(%rsp),%r14
	movq	616(%rsp),%r15
	movq	624(%rsp),%rdi
	movq	632(%rsp),%rsi

	movq	%r9,%rax
	mulq	%rax
	movq	%rax,%r9
	movq	%rdx,%rbx

	addq	%r10,%r10
	adcq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14
	adcq	%r15,%r15
	adcq	%rdi,%rdi
	adcq	%rsi,%rsi
	adcq	$0,%rbx

	addq	%rbp,%r10

	movq	32(%rcx),%rax
	mulq	%rax

	addq	%r8,%r10
	adcq	%rax,%r11
	adcq	$0,%rdx

	movq	%rdx,%rbp

	movq	%r10,576(%rsp)
	movq	%r11,584(%rsp)

	movq	40(%rcx),%rax
	mulq	%rax

	addq	%rbp,%r12
	adcq	%rax,%r13
	adcq	$0,%rdx

	movq	%rdx,%rbp

	movq	%r12,592(%rsp)
	movq	%r13,600(%rsp)

	movq	48(%rcx),%rax
	mulq	%rax

	addq	%rbp,%r14
	adcq	%rax,%r15
	adcq	$0,%rdx

	movq	%r14,608(%rsp)
	movq	%r15,616(%rsp)

	addq	%rdx,%rdi
	adcq	%r9,%rsi
	adcq	$0,%rbx

	movq	%rdi,624(%rsp)
	movq	%rsi,632(%rsp)
	movq	%rbx,640(%rsp)

	jmp	mont_reduce



.globl	_mod_exp_512
.private_extern _mod_exp_512

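// mod_exp_512(result=%rdi, g=%rsi, exp=%rdx, modulus context=%rcx), SysV
// argument order as used below: computes g^exp mod m for a 512-bit modulus
// with a 5-bit fixed window and a scattered (constant-time) lookup table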
_mod_exp_512:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15


	movq	%rsp,%r8
	subq	$2688,%rsp
	andq	$-64,%rsp


	movq	%r8,0(%rsp)
	movq	%rdi,8(%rsp)
	movq	%rsi,16(%rsp)
	movq	%rcx,24(%rsp)
L$body:



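// place g, shifted left by 256 bits and zero-padded to 1024 bits, at
// 512(%rsp); the mont_reduce call below then brings g into the internal
// (folded) representation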
	pxor	%xmm4,%xmm4
	movdqu	0(%rsi),%xmm0
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqa	%xmm4,512(%rsp)
	movdqa	%xmm4,528(%rsp)
	movdqa	%xmm4,608(%rsp)
	movdqa	%xmm4,624(%rsp)
	movdqa	%xmm0,544(%rsp)
	movdqa	%xmm1,560(%rsp)
	movdqa	%xmm2,576(%rsp)
	movdqa	%xmm3,592(%rsp)


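// the 512-bit exponent stays parked in %xmm0-%xmm3 until the window table
// has been built (it is spilled to 64(%rsp) just before the window scan)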
	movdqu	0(%rdx),%xmm0
	movdqu	16(%rdx),%xmm1
	movdqu	32(%rdx),%xmm2
	movdqu	48(%rdx),%xmm3

	leaq	384(%rsp),%rbx
	movq	%rbx,136(%rsp)
	call	mont_reduce


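// build the internal representation of 1 at 448(%rsp): every qword is
// zeroed except 16(%rcx), which is deliberately skipped and then set to 1
// (the value 2^128); 128(%rsp) is cleared so the dword read at 126(%rsp)
// during the window scan sees zero above the exponent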
	leaq	448(%rsp),%rcx
	xorq	%rax,%rax
	movq	%rax,0(%rcx)
	movq	%rax,8(%rcx)
	movq	%rax,24(%rcx)
	movq	%rax,32(%rcx)
	movq	%rax,40(%rcx)
	movq	%rax,48(%rcx)
	movq	%rax,56(%rcx)
	movq	%rax,128(%rsp)
	movq	$1,16(%rcx)

	leaq	640(%rsp),%rbp
	movq	%rcx,%rsi
	movq	%rbp,%rdi
	movq	$8,%rax
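// scatter the 512-bit entry into the table as 16-bit slices 64 bytes apart;
// successive table entries sit 2 bytes apart, so the slices of all 32
// entries share cache lines and a lookup's access pattern does not depend
// on the (secret) window value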
loop_0:
	movq	(%rcx),%rbx
	movw	%bx,(%rdi)
	shrq	$16,%rbx
	movw	%bx,64(%rdi)
	shrq	$16,%rbx
	movw	%bx,128(%rdi)
	shrq	$16,%rbx
	movw	%bx,192(%rdi)
	leaq	8(%rcx),%rcx
	leaq	256(%rdi),%rdi
	decq	%rax
	jnz	loop_0
	movq	$31,%rax
	movq	%rax,32(%rsp)
	movq	%rbp,40(%rsp)

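// table[0] = 1; now repeatedly multiply by the reduced g and scatter the
// products into entries 1..31 (40(%rsp) tracks the entry offset, +2 bytes
// per iteration)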
	movq	%rsi,136(%rsp)
	movq	0(%rsi),%r10
	movq	8(%rsi),%r11
	movq	16(%rsi),%r12
	movq	24(%rsi),%r13
	movq	32(%rsi),%r14
	movq	40(%rsi),%r15
	movq	48(%rsi),%r8
	movq	56(%rsi),%r9
init_loop:
	leaq	384(%rsp),%rdi
	call	mont_mul_a3b
	leaq	448(%rsp),%rsi
	movq	40(%rsp),%rbp
	addq	$2,%rbp
	movq	%rbp,40(%rsp)
	movq	%rsi,%rcx
	movq	$8,%rax
loop_1:
	movq	(%rcx),%rbx
	movw	%bx,(%rbp)
	shrq	$16,%rbx
	movw	%bx,64(%rbp)
	shrq	$16,%rbx
	movw	%bx,128(%rbp)
	shrq	$16,%rbx
	movw	%bx,192(%rbp)
	leaq	8(%rcx),%rcx
	leaq	256(%rbp),%rbp
	decq	%rax
	jnz	loop_1
	movq	32(%rsp),%rax
	subq	$1,%rax
	movq	%rax,32(%rsp)
	jne	init_loop



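// spill the exponent to 64(%rsp)..127(%rsp) for the window scan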
	movdqa	%xmm0,64(%rsp)
	movdqa	%xmm1,80(%rsp)
	movdqa	%xmm2,96(%rsp)
	movdqa	%xmm3,112(%rsp)





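// peel off the exponent's top bits as the first window, mask them out in
// place, and use the window value to index the table for the initial
// accumulator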
	movl	126(%rsp),%eax
	movq	%rax,%rdx
	shrq	$11,%rax
	andl	$2047,%edx
	movl	%edx,126(%rsp)
	leaq	640(%rsp,%rax,2),%rsi
	movq	8(%rsp),%rdx
	movq	$4,%rbp
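// gather the selected table entry into the result buffer, rebuilding two
// qwords per iteration from the 16-bit slices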
loop_2:
	movzwq	192(%rsi),%rbx
	movzwq	448(%rsi),%rax
	shlq	$16,%rbx
	shlq	$16,%rax
	movw	128(%rsi),%bx
	movw	384(%rsi),%ax
	shlq	$16,%rbx
	shlq	$16,%rax
	movw	64(%rsi),%bx
	movw	320(%rsi),%ax
	shlq	$16,%rbx
	shlq	$16,%rax
	movw	0(%rsi),%bx
	movw	256(%rsi),%ax
	movq	%rbx,0(%rdx)
	movq	%rax,8(%rdx)
	leaq	512(%rsi),%rsi
	leaq	16(%rdx),%rdx
	subq	$1,%rbp
	jnz	loop_2
	movq	$505,48(%rsp)

	movq	8(%rsp),%rcx
	movq	%rcx,136(%rsp)
	movq	0(%rcx),%r10
	movq	8(%rcx),%r11
	movq	16(%rcx),%r12
	movq	24(%rcx),%r13
	movq	32(%rcx),%r14
	movq	40(%rcx),%r15
	movq	48(%rcx),%r8
	movq	56(%rcx),%r9
	jmp	sqr_2

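// main ladder: five squarings per 5-bit window (the first pass enters at
// sqr_2, the top window having been consumed above), then one multiply by
// the table entry for the next window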
main_loop_a3b:
	call	sqr_reduce
	call	sqr_reduce
	call	sqr_reduce
sqr_2:
	call	sqr_reduce
	call	sqr_reduce



	movq	48(%rsp),%rcx
	movq	%rcx,%rax
	shrq	$4,%rax
	movl	64(%rsp,%rax,2),%edx
	andq	$15,%rcx
	shrq	%cl,%rdx
	andq	$31,%rdx

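// gather table[window] into 448(%rsp) and multiply the accumulator by it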
	leaq	640(%rsp,%rdx,2),%rsi
	leaq	448(%rsp),%rdx
	movq	%rdx,%rdi
	movq	$4,%rbp
loop_3:
	movzwq	192(%rsi),%rbx
	movzwq	448(%rsi),%rax
	shlq	$16,%rbx
	shlq	$16,%rax
	movw	128(%rsi),%bx
	movw	384(%rsi),%ax
	shlq	$16,%rbx
	shlq	$16,%rax
	movw	64(%rsi),%bx
	movw	320(%rsi),%ax
	shlq	$16,%rbx
	shlq	$16,%rax
	movw	0(%rsi),%bx
	movw	256(%rsi),%ax
	movq	%rbx,0(%rdx)
	movq	%rax,8(%rdx)
	leaq	512(%rsi),%rsi
	leaq	16(%rdx),%rdx
	subq	$1,%rbp
	jnz	loop_3
	movq	8(%rsp),%rsi
	call	mont_mul_a3b



	movq	48(%rsp),%rcx
	subq	$5,%rcx
	movq	%rcx,48(%rsp)
	jge	main_loop_a3b



end_main_loop_a3b:


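// leave the internal representation: zero-extend the accumulator to 1024
// bits (no shift this time) and run one more reduction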
	movq	8(%rsp),%rdx
	pxor	%xmm4,%xmm4
	movdqu	0(%rdx),%xmm0
	movdqu	16(%rdx),%xmm1
	movdqu	32(%rdx),%xmm2
	movdqu	48(%rdx),%xmm3
	movdqa	%xmm4,576(%rsp)
	movdqa	%xmm4,592(%rsp)
	movdqa	%xmm4,608(%rsp)
	movdqa	%xmm4,624(%rsp)
	movdqa	%xmm0,512(%rsp)
	movdqa	%xmm1,528(%rsp)
	movdqa	%xmm2,544(%rsp)
	movdqa	%xmm3,560(%rsp)
	call	mont_reduce



	movq	8(%rsp),%rax
	movq	0(%rax),%r8
	movq	8(%rax),%r9
	movq	16(%rax),%r10
	movq	24(%rax),%r11
	movq	32(%rax),%r12
	movq	40(%rax),%r13
	movq	48(%rax),%r14
	movq	56(%rax),%r15


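// final conditional subtraction: compute result - m (m sits at offset 512
// in the modulus context) and keep it only if no borrow, branch-free via
// cmovnc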
	movq	24(%rsp),%rbx
	addq	$512,%rbx

	subq	0(%rbx),%r8
	sbbq	8(%rbx),%r9
	sbbq	16(%rbx),%r10
	sbbq	24(%rbx),%r11
	sbbq	32(%rbx),%r12
	sbbq	40(%rbx),%r13
	sbbq	48(%rbx),%r14
	sbbq	56(%rbx),%r15


	movq	0(%rax),%rsi
	movq	8(%rax),%rdi
	movq	16(%rax),%rcx
	movq	24(%rax),%rdx
	cmovncq	%r8,%rsi
	cmovncq	%r9,%rdi
	cmovncq	%r10,%rcx
	cmovncq	%r11,%rdx
	movq	%rsi,0(%rax)
	movq	%rdi,8(%rax)
	movq	%rcx,16(%rax)
	movq	%rdx,24(%rax)

	movq	32(%rax),%rsi
	movq	40(%rax),%rdi
	movq	48(%rax),%rcx
	movq	56(%rax),%rdx
	cmovncq	%r12,%rsi
	cmovncq	%r13,%rdi
	cmovncq	%r14,%rcx
	cmovncq	%r15,%rdx
	movq	%rsi,32(%rax)
	movq	%rdi,40(%rax)
	movq	%rcx,48(%rax)
	movq	%rdx,56(%rax)

	movq	0(%rsp),%rsi
	movq	0(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbx
	movq	40(%rsi),%rbp
	leaq	48(%rsi),%rsp
L$epilogue:
	.byte	0xf3,0xc3	// two-byte "rep ret"

#endif