; Select the code section according to the assembler's output format.
%ifidn __OUTPUT_FORMAT__,obj
section	code	use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
%ifdef __YASM_VERSION_ID__
; Reject Yasm releases older than 1.1.0 (version id 01010000h).
%if __YASM_VERSION_ID__ < 01010000h
%error yasm version 1.1.0 or later needed.
%endif
; Yasm automatically includes @feat.00 and complains about redefining it.
; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
%else
; Not Yasm: define the @feat.00 symbol explicitly.
; NOTE(review): bit 0 of @feat.00 is understood to mark the object file as
; SafeSEH-compatible for the Windows linker -- confirm against the assembler
; manual's win32 safeseh section.
$@feat.00 equ 1
%endif
section	.text	code align=64
%else
; Any other output format: plain .text code section.
section	.text	code
%endif
;-----------------------------------------------------------------------
; _bn_mul_comba8 -- r[0..15] = a[0..7] * b[0..7]
;
; Fully unrolled column-wise (Comba) multiplication of two 8-dword
; operands, least-significant dword first.  Result word k is the low
; dword of the running sum of every a[i]*b[j] with i+j == k.
;
; Stack arguments on entry (32-bit pointers):
;   [esp+4]  = r   destination; 16 dwords are written
;   [esp+8]  = a   first operand; 8 dwords are read
;   [esp+12] = b   second operand; 8 dwords are read
; Presumably the C prototype is
;   void bn_mul_comba8(uint32_t *r, const uint32_t *a, const uint32_t *b)
; -- TODO confirm against the declaring header.
;
; Returns with a plain `ret`, so the caller removes the arguments.
; Preserved: esi, edi, ebp, ebx (pushed/popped).
; Clobbered: eax, ecx, edx, flags.
;
; Register roles inside the body:
;   esi = a, edi = b
;   eax, edx = MUL operands / 64-bit product (edx:eax); eax is also used
;              to hold r (reloaded from [20+esp]) just before each store
;   ebx, ecx, ebp = three-word column accumulator.  The roles rotate
;              after every column: the word that was just stored is
;              cleared and becomes the new most-significant word.
;
; The `mov` instructions placed between `add`/`adc` do not modify the
; flags, so the carry chain of each accumulation stays intact while the
; operands of the next product are being loaded.
;
; r must not overlap a or b: low result words are stored while a[] and
; b[] are still being re-read for later columns.
;-----------------------------------------------------------------------
global	_bn_mul_comba8
align	16
_bn_mul_comba8:
L$_bn_mul_comba8_begin:
	push	esi
	; One register pushed so far: a (second argument) is at [12+esp].
	mov	esi,DWORD [12+esp]
	push	edi
	; Two registers pushed so far: b (third argument) is at [20+esp].
	mov	edi,DWORD [20+esp]
	push	ebp
	push	ebx
	; Four registers pushed: from here on r (first argument) is at [20+esp].
	xor	ebx,ebx
	mov	eax,DWORD [esi]
	xor	ecx,ecx
	mov	edx,DWORD [edi]
	; ################## Calculate word 0
	; Accumulator (low..high) = ebx, ecx, ebp
	xor	ebp,ebp
	; mul a[0]*b[0]
	mul	edx
	add	ebx,eax
	; eax = r for the store below (mov leaves the carry flag untouched)
	mov	eax,DWORD [20+esp]
	adc	ecx,edx
	; preload b[0] for the first product of the next column
	mov	edx,DWORD [edi]
	adc	ebp,0
	mov	DWORD [eax],ebx
	; preload a[1] for the first product of the next column
	mov	eax,DWORD [4+esi]
	; saved r[0]
	; ################## Calculate word 1
	; Accumulator (low..high) = ecx, ebp, ebx
	xor	ebx,ebx
	; mul a[1]*b[0]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [esi]
	adc	ebp,edx
	mov	edx,DWORD [4+edi]
	adc	ebx,0
	; mul a[0]*b[1]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [20+esp]
	adc	ebp,edx
	mov	edx,DWORD [edi]
	adc	ebx,0
	mov	DWORD [4+eax],ecx
	mov	eax,DWORD [8+esi]
	; saved r[1]
	; ################## Calculate word 2
	; Accumulator (low..high) = ebp, ebx, ecx
	xor	ecx,ecx
	; mul a[2]*b[0]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [4+esi]
	adc	ebx,edx
	mov	edx,DWORD [4+edi]
	adc	ecx,0
	; mul a[1]*b[1]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [esi]
	adc	ebx,edx
	mov	edx,DWORD [8+edi]
	adc	ecx,0
	; mul a[0]*b[2]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [20+esp]
	adc	ebx,edx
	mov	edx,DWORD [edi]
	adc	ecx,0
	mov	DWORD [8+eax],ebp
	mov	eax,DWORD [12+esi]
	; saved r[2]
	; ################## Calculate word 3
	xor	ebp,ebp
	; mul a[3]*b[0]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [8+esi]
	adc	ecx,edx
	mov	edx,DWORD [4+edi]
	adc	ebp,0
	; mul a[2]*b[1]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [4+esi]
	adc	ecx,edx
	mov	edx,DWORD [8+edi]
	adc	ebp,0
	; mul a[1]*b[2]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [esi]
	adc	ecx,edx
	mov	edx,DWORD [12+edi]
	adc	ebp,0
	; mul a[0]*b[3]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [20+esp]
	adc	ecx,edx
	mov	edx,DWORD [edi]
	adc	ebp,0
	mov	DWORD [12+eax],ebx
	mov	eax,DWORD [16+esi]
	; saved r[3]
	; ################## Calculate word 4
	xor	ebx,ebx
	; mul a[4]*b[0]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [12+esi]
	adc	ebp,edx
	mov	edx,DWORD [4+edi]
	adc	ebx,0
	; mul a[3]*b[1]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [8+esi]
	adc	ebp,edx
	mov	edx,DWORD [8+edi]
	adc	ebx,0
	; mul a[2]*b[2]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [4+esi]
	adc	ebp,edx
	mov	edx,DWORD [12+edi]
	adc	ebx,0
	; mul a[1]*b[3]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [esi]
	adc	ebp,edx
	mov	edx,DWORD [16+edi]
	adc	ebx,0
	; mul a[0]*b[4]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [20+esp]
	adc	ebp,edx
	mov	edx,DWORD [edi]
	adc	ebx,0
	mov	DWORD [16+eax],ecx
	mov	eax,DWORD [20+esi]
	; saved r[4]
	; ################## Calculate word 5
	xor	ecx,ecx
	; mul a[5]*b[0]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [16+esi]
	adc	ebx,edx
	mov	edx,DWORD [4+edi]
	adc	ecx,0
	; mul a[4]*b[1]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [12+esi]
	adc	ebx,edx
	mov	edx,DWORD [8+edi]
	adc	ecx,0
	; mul a[3]*b[2]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [8+esi]
	adc	ebx,edx
	mov	edx,DWORD [12+edi]
	adc	ecx,0
	; mul a[2]*b[3]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [4+esi]
	adc	ebx,edx
	mov	edx,DWORD [16+edi]
	adc	ecx,0
	; mul a[1]*b[4]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [esi]
	adc	ebx,edx
	mov	edx,DWORD [20+edi]
	adc	ecx,0
	; mul a[0]*b[5]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [20+esp]
	adc	ebx,edx
	mov	edx,DWORD [edi]
	adc	ecx,0
	mov	DWORD [20+eax],ebp
	mov	eax,DWORD [24+esi]
	; saved r[5]
	; ################## Calculate word 6
	xor	ebp,ebp
	; mul a[6]*b[0]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [20+esi]
	adc	ecx,edx
	mov	edx,DWORD [4+edi]
	adc	ebp,0
	; mul a[5]*b[1]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [16+esi]
	adc	ecx,edx
	mov	edx,DWORD [8+edi]
	adc	ebp,0
	; mul a[4]*b[2]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [12+esi]
	adc	ecx,edx
	mov	edx,DWORD [12+edi]
	adc	ebp,0
	; mul a[3]*b[3]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [8+esi]
	adc	ecx,edx
	mov	edx,DWORD [16+edi]
	adc	ebp,0
	; mul a[2]*b[4]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [4+esi]
	adc	ecx,edx
	mov	edx,DWORD [20+edi]
	adc	ebp,0
	; mul a[1]*b[5]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [esi]
	adc	ecx,edx
	mov	edx,DWORD [24+edi]
	adc	ebp,0
	; mul a[0]*b[6]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [20+esp]
	adc	ecx,edx
	mov	edx,DWORD [edi]
	adc	ebp,0
	mov	DWORD [24+eax],ebx
	mov	eax,DWORD [28+esi]
	; saved r[6]
	; ################## Calculate word 7
	; Widest column: eight products, a[7]*b[0] .. a[0]*b[7].
	xor	ebx,ebx
	; mul a[7]*b[0]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [24+esi]
	adc	ebp,edx
	mov	edx,DWORD [4+edi]
	adc	ebx,0
	; mul a[6]*b[1]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [20+esi]
	adc	ebp,edx
	mov	edx,DWORD [8+edi]
	adc	ebx,0
	; mul a[5]*b[2]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [16+esi]
	adc	ebp,edx
	mov	edx,DWORD [12+edi]
	adc	ebx,0
	; mul a[4]*b[3]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [12+esi]
	adc	ebp,edx
	mov	edx,DWORD [16+edi]
	adc	ebx,0
	; mul a[3]*b[4]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [8+esi]
	adc	ebp,edx
	mov	edx,DWORD [20+edi]
	adc	ebx,0
	; mul a[2]*b[5]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [4+esi]
	adc	ebp,edx
	mov	edx,DWORD [24+edi]
	adc	ebx,0
	; mul a[1]*b[6]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [esi]
	adc	ebp,edx
	mov	edx,DWORD [28+edi]
	adc	ebx,0
	; mul a[0]*b[7]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [20+esp]
	adc	ebp,edx
	mov	edx,DWORD [4+edi]
	adc	ebx,0
	mov	DWORD [28+eax],ecx
	mov	eax,DWORD [28+esi]
	; saved r[7]
	; ################## Calculate word 8
	; From here on each column starts at a[7] and has one product fewer.
	xor	ecx,ecx
	; mul a[7]*b[1]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [24+esi]
	adc	ebx,edx
	mov	edx,DWORD [8+edi]
	adc	ecx,0
	; mul a[6]*b[2]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [20+esi]
	adc	ebx,edx
	mov	edx,DWORD [12+edi]
	adc	ecx,0
	; mul a[5]*b[3]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [16+esi]
	adc	ebx,edx
	mov	edx,DWORD [16+edi]
	adc	ecx,0
	; mul a[4]*b[4]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [12+esi]
	adc	ebx,edx
	mov	edx,DWORD [20+edi]
	adc	ecx,0
	; mul a[3]*b[5]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [8+esi]
	adc	ebx,edx
	mov	edx,DWORD [24+edi]
	adc	ecx,0
	; mul a[2]*b[6]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [4+esi]
	adc	ebx,edx
	mov	edx,DWORD [28+edi]
	adc	ecx,0
	; mul a[1]*b[7]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [20+esp]
	adc	ebx,edx
	mov	edx,DWORD [8+edi]
	adc	ecx,0
	mov	DWORD [32+eax],ebp
	mov	eax,DWORD [28+esi]
	; saved r[8]
	; ################## Calculate word 9
	xor	ebp,ebp
	; mul a[7]*b[2]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [24+esi]
	adc	ecx,edx
	mov	edx,DWORD [12+edi]
	adc	ebp,0
	; mul a[6]*b[3]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [20+esi]
	adc	ecx,edx
	mov	edx,DWORD [16+edi]
	adc	ebp,0
	; mul a[5]*b[4]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [16+esi]
	adc	ecx,edx
	mov	edx,DWORD [20+edi]
	adc	ebp,0
	; mul a[4]*b[5]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [12+esi]
	adc	ecx,edx
	mov	edx,DWORD [24+edi]
	adc	ebp,0
	; mul a[3]*b[6]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [8+esi]
	adc	ecx,edx
	mov	edx,DWORD [28+edi]
	adc	ebp,0
	; mul a[2]*b[7]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [20+esp]
	adc	ecx,edx
	mov	edx,DWORD [12+edi]
	adc	ebp,0
	mov	DWORD [36+eax],ebx
	mov	eax,DWORD [28+esi]
	; saved r[9]
	; ################## Calculate word 10
	xor	ebx,ebx
	; mul a[7]*b[3]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [24+esi]
	adc	ebp,edx
	mov	edx,DWORD [16+edi]
	adc	ebx,0
	; mul a[6]*b[4]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [20+esi]
	adc	ebp,edx
	mov	edx,DWORD [20+edi]
	adc	ebx,0
	; mul a[5]*b[5]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [16+esi]
	adc	ebp,edx
	mov	edx,DWORD [24+edi]
	adc	ebx,0
	; mul a[4]*b[6]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [12+esi]
	adc	ebp,edx
	mov	edx,DWORD [28+edi]
	adc	ebx,0
	; mul a[3]*b[7]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [20+esp]
	adc	ebp,edx
	mov	edx,DWORD [16+edi]
	adc	ebx,0
	mov	DWORD [40+eax],ecx
	mov	eax,DWORD [28+esi]
	; saved r[10]
	; ################## Calculate word 11
	xor	ecx,ecx
	; mul a[7]*b[4]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [24+esi]
	adc	ebx,edx
	mov	edx,DWORD [20+edi]
	adc	ecx,0
	; mul a[6]*b[5]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [20+esi]
	adc	ebx,edx
	mov	edx,DWORD [24+edi]
	adc	ecx,0
	; mul a[5]*b[6]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [16+esi]
	adc	ebx,edx
	mov	edx,DWORD [28+edi]
	adc	ecx,0
	; mul a[4]*b[7]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [20+esp]
	adc	ebx,edx
	mov	edx,DWORD [20+edi]
	adc	ecx,0
	mov	DWORD [44+eax],ebp
	mov	eax,DWORD [28+esi]
	; saved r[11]
	; ################## Calculate word 12
	xor	ebp,ebp
	; mul a[7]*b[5]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [24+esi]
	adc	ecx,edx
	mov	edx,DWORD [24+edi]
	adc	ebp,0
	; mul a[6]*b[6]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [20+esi]
	adc	ecx,edx
	mov	edx,DWORD [28+edi]
	adc	ebp,0
	; mul a[5]*b[7]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [20+esp]
	adc	ecx,edx
	mov	edx,DWORD [24+edi]
	adc	ebp,0
	mov	DWORD [48+eax],ebx
	mov	eax,DWORD [28+esi]
	; saved r[12]
	; ################## Calculate word 13
	xor	ebx,ebx
	; mul a[7]*b[6]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [24+esi]
	adc	ebp,edx
	mov	edx,DWORD [28+edi]
	adc	ebx,0
	; mul a[6]*b[7]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [20+esp]
	adc	ebp,edx
	mov	edx,DWORD [28+edi]
	adc	ebx,0
	mov	DWORD [52+eax],ecx
	mov	eax,DWORD [28+esi]
	; saved r[13]
	; ################## Calculate word 14
	xor	ecx,ecx
	; mul a[7]*b[7]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [20+esp]
	adc	ebx,edx
	; ecx receives the carry out of the top column but is never stored:
	; an 8-word by 8-word product always fits in 16 words.
	adc	ecx,0
	mov	DWORD [56+eax],ebp
	; saved r[14]
	; save r[15]
	mov	DWORD [60+eax],ebx
	; Restore callee-saved registers in reverse push order.
	pop	ebx
	pop	ebp
	pop	edi
	pop	esi
	ret
;-----------------------------------------------------------------------
; _bn_mul_comba4 -- r[0..7] = a[0..3] * b[0..3]
;
; Fully unrolled column-wise (Comba) multiplication of two 4-dword
; operands, least-significant dword first.  Result word k is the low
; dword of the running sum of every a[i]*b[j] with i+j == k.
;
; Stack arguments on entry (32-bit pointers):
;   [esp+4]  = r   destination; 8 dwords are written
;   [esp+8]  = a   first operand; 4 dwords are read
;   [esp+12] = b   second operand; 4 dwords are read
; Presumably the C prototype is
;   void bn_mul_comba4(uint32_t *r, const uint32_t *a, const uint32_t *b)
; -- TODO confirm against the declaring header.
;
; Returns with a plain `ret`, so the caller removes the arguments.
; Preserved: esi, edi, ebp, ebx (pushed/popped).
; Clobbered: eax, ecx, edx, flags.
;
; Register roles inside the body:
;   esi = a, edi = b
;   eax, edx = MUL operands / 64-bit product (edx:eax); eax is also used
;              to hold r (reloaded from [20+esp]) just before each store
;   ebx, ecx, ebp = three-word column accumulator whose roles rotate
;              after every column (the stored word is cleared and
;              becomes the new most-significant word).
;
; The `mov` instructions placed between `add`/`adc` do not modify the
; flags, so each carry chain stays intact while the next operands load.
;
; r must not overlap a or b: low result words are stored while a[] and
; b[] are still being re-read for later columns.
;-----------------------------------------------------------------------
global	_bn_mul_comba4
align	16
_bn_mul_comba4:
L$_bn_mul_comba4_begin:
	push	esi
	; One register pushed so far: a (second argument) is at [12+esp].
	mov	esi,DWORD [12+esp]
	push	edi
	; Two registers pushed so far: b (third argument) is at [20+esp].
	mov	edi,DWORD [20+esp]
	push	ebp
	push	ebx
	; Four registers pushed: from here on r (first argument) is at [20+esp].
	xor	ebx,ebx
	mov	eax,DWORD [esi]
	xor	ecx,ecx
	mov	edx,DWORD [edi]
	; ################## Calculate word 0
	; Accumulator (low..high) = ebx, ecx, ebp
	xor	ebp,ebp
	; mul a[0]*b[0]
	mul	edx
	add	ebx,eax
	; eax = r for the store below (mov leaves the carry flag untouched)
	mov	eax,DWORD [20+esp]
	adc	ecx,edx
	mov	edx,DWORD [edi]
	adc	ebp,0
	mov	DWORD [eax],ebx
	mov	eax,DWORD [4+esi]
	; saved r[0]
	; ################## Calculate word 1
	; Accumulator (low..high) = ecx, ebp, ebx
	xor	ebx,ebx
	; mul a[1]*b[0]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [esi]
	adc	ebp,edx
	mov	edx,DWORD [4+edi]
	adc	ebx,0
	; mul a[0]*b[1]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [20+esp]
	adc	ebp,edx
	mov	edx,DWORD [edi]
	adc	ebx,0
	mov	DWORD [4+eax],ecx
	mov	eax,DWORD [8+esi]
	; saved r[1]
	; ################## Calculate word 2
	; Accumulator (low..high) = ebp, ebx, ecx
	xor	ecx,ecx
	; mul a[2]*b[0]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [4+esi]
	adc	ebx,edx
	mov	edx,DWORD [4+edi]
	adc	ecx,0
	; mul a[1]*b[1]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [esi]
	adc	ebx,edx
	mov	edx,DWORD [8+edi]
	adc	ecx,0
	; mul a[0]*b[2]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [20+esp]
	adc	ebx,edx
	mov	edx,DWORD [edi]
	adc	ecx,0
	mov	DWORD [8+eax],ebp
	mov	eax,DWORD [12+esi]
	; saved r[2]
	; ################## Calculate word 3
	; Widest column: four products, a[3]*b[0] .. a[0]*b[3].
	xor	ebp,ebp
	; mul a[3]*b[0]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [8+esi]
	adc	ecx,edx
	mov	edx,DWORD [4+edi]
	adc	ebp,0
	; mul a[2]*b[1]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [4+esi]
	adc	ecx,edx
	mov	edx,DWORD [8+edi]
	adc	ebp,0
	; mul a[1]*b[2]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [esi]
	adc	ecx,edx
	mov	edx,DWORD [12+edi]
	adc	ebp,0
	; mul a[0]*b[3]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [20+esp]
	adc	ecx,edx
	mov	edx,DWORD [4+edi]
	adc	ebp,0
	mov	DWORD [12+eax],ebx
	mov	eax,DWORD [12+esi]
	; saved r[3]
	; ################## Calculate word 4
	; From here on each column starts at a[3] and has one product fewer.
	xor	ebx,ebx
	; mul a[3]*b[1]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [8+esi]
	adc	ebp,edx
	mov	edx,DWORD [8+edi]
	adc	ebx,0
	; mul a[2]*b[2]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [4+esi]
	adc	ebp,edx
	mov	edx,DWORD [12+edi]
	adc	ebx,0
	; mul a[1]*b[3]
	mul	edx
	add	ecx,eax
	mov	eax,DWORD [20+esp]
	adc	ebp,edx
	mov	edx,DWORD [8+edi]
	adc	ebx,0
	mov	DWORD [16+eax],ecx
	mov	eax,DWORD [12+esi]
	; saved r[4]
	; ################## Calculate word 5
	xor	ecx,ecx
	; mul a[3]*b[2]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [8+esi]
	adc	ebx,edx
	mov	edx,DWORD [12+edi]
	adc	ecx,0
	; mul a[2]*b[3]
	mul	edx
	add	ebp,eax
	mov	eax,DWORD [20+esp]
	adc	ebx,edx
	mov	edx,DWORD [12+edi]
	adc	ecx,0
	mov	DWORD [20+eax],ebp
	mov	eax,DWORD [12+esi]
	; saved r[5]
	; ################## Calculate word 6
	xor	ebp,ebp
	; mul a[3]*b[3]
	mul	edx
	add	ebx,eax
	mov	eax,DWORD [20+esp]
	adc	ecx,edx
	; ebp receives the carry out of the top column but is never stored:
	; a 4-word by 4-word product always fits in 8 words.
	adc	ebp,0
	mov	DWORD [24+eax],ebx
	; saved r[6]
	; save r[7]
	mov	DWORD [28+eax],ecx
	; Restore callee-saved registers in reverse push order.
	pop	ebx
	pop	ebp
	pop	edi
	pop	esi
	ret
;-----------------------------------------------------------------------
; _bn_sqr_comba8 -- r[0..15] = a[0..7] squared
;
; Fully unrolled column-wise (Comba) squaring of an 8-dword operand,
; least-significant dword first.  For column k, each cross product
; a[i]*a[j] with i+j == k and i != j is computed once and doubled;
; the diagonal term a[k/2]^2 (even k only) is added undoubled.
;
; Stack arguments on entry (32-bit pointers):
;   [esp+4] = r   destination; 16 dwords are written
;   [esp+8] = a   operand; 8 dwords are read
; Presumably the C prototype is
;   void bn_sqr_comba8(uint32_t *r, const uint32_t *a)
; -- TODO confirm against the declaring header.
;
; Returns with a plain `ret`, so the caller removes the arguments.
; Preserved: esi, edi, ebp, ebx (pushed/popped).
; Clobbered: eax, ecx, edx, flags.
;
; Register roles inside the body:
;   edi = r, esi = a
;   eax, edx = MUL operands / 64-bit product (edx:eax)
;   ebx, ecx, ebp = three-word column accumulator whose roles rotate
;              after every column (the stored word is cleared and
;              becomes the new most-significant word).
;
; Doubling idiom used for every cross product:
;   add eax,eax / adc edx,edx / adc <top>,0   ; edx:eax *= 2, bit 63 -> top
;   add <low>,eax / adc <mid>,edx / ... / adc <top>,0
; The `mov` instructions inside the second chain do not modify flags.
;
; r must not overlap a: low result words are stored while a[] is still
; being re-read for later columns.
;-----------------------------------------------------------------------
global	_bn_sqr_comba8
align	16
_bn_sqr_comba8:
L$_bn_sqr_comba8_begin:
	push	esi
	push	edi
	push	ebp
	push	ebx
	; Four registers pushed: r (first argument) is at [20+esp],
	; a (second argument) is at [24+esp].
	mov	edi,DWORD [20+esp]
	mov	esi,DWORD [24+esp]
	xor	ebx,ebx
	xor	ecx,ecx
	mov	eax,DWORD [esi]
	; ############### Calculate word 0
	; Accumulator (low..high) = ebx, ecx, ebp
	xor	ebp,ebp
	; sqr a[0]*a[0]
	mul	eax
	add	ebx,eax
	adc	ecx,edx
	mov	edx,DWORD [esi]
	adc	ebp,0
	mov	DWORD [edi],ebx
	mov	eax,DWORD [4+esi]
	; saved r[0]
	; ############### Calculate word 1
	; Accumulator (low..high) = ecx, ebp, ebx
	xor	ebx,ebx
	; sqr a[1]*a[0]
	mul	edx
	; Double the 64-bit product; the bit shifted out goes to the top word.
	add	eax,eax
	adc	edx,edx
	adc	ebx,0
	; Accumulate the doubled product into the column sum.
	add	ecx,eax
	adc	ebp,edx
	mov	eax,DWORD [8+esi]
	adc	ebx,0
	mov	DWORD [4+edi],ecx
	mov	edx,DWORD [esi]
	; saved r[1]
	; ############### Calculate word 2
	; Accumulator (low..high) = ebp, ebx, ecx
	xor	ecx,ecx
	; sqr a[2]*a[0]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ecx,0
	add	ebp,eax
	adc	ebx,edx
	mov	eax,DWORD [4+esi]
	adc	ecx,0
	; sqr a[1]*a[1]
	; Diagonal term: added once, not doubled.
	mul	eax
	add	ebp,eax
	adc	ebx,edx
	mov	edx,DWORD [esi]
	adc	ecx,0
	mov	DWORD [8+edi],ebp
	mov	eax,DWORD [12+esi]
	; saved r[2]
	; ############### Calculate word 3
	xor	ebp,ebp
	; sqr a[3]*a[0]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebp,0
	add	ebx,eax
	adc	ecx,edx
	mov	eax,DWORD [8+esi]
	adc	ebp,0
	mov	edx,DWORD [4+esi]
	; sqr a[2]*a[1]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebp,0
	add	ebx,eax
	adc	ecx,edx
	mov	eax,DWORD [16+esi]
	adc	ebp,0
	mov	DWORD [12+edi],ebx
	mov	edx,DWORD [esi]
	; saved r[3]
	; ############### Calculate word 4
	xor	ebx,ebx
	; sqr a[4]*a[0]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebx,0
	add	ecx,eax
	adc	ebp,edx
	mov	eax,DWORD [12+esi]
	adc	ebx,0
	mov	edx,DWORD [4+esi]
	; sqr a[3]*a[1]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebx,0
	add	ecx,eax
	adc	ebp,edx
	mov	eax,DWORD [8+esi]
	adc	ebx,0
	; sqr a[2]*a[2]
	mul	eax
	add	ecx,eax
	adc	ebp,edx
	mov	edx,DWORD [esi]
	adc	ebx,0
	mov	DWORD [16+edi],ecx
	mov	eax,DWORD [20+esi]
	; saved r[4]
	; ############### Calculate word 5
	xor	ecx,ecx
	; sqr a[5]*a[0]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ecx,0
	add	ebp,eax
	adc	ebx,edx
	mov	eax,DWORD [16+esi]
	adc	ecx,0
	mov	edx,DWORD [4+esi]
	; sqr a[4]*a[1]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ecx,0
	add	ebp,eax
	adc	ebx,edx
	mov	eax,DWORD [12+esi]
	adc	ecx,0
	mov	edx,DWORD [8+esi]
	; sqr a[3]*a[2]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ecx,0
	add	ebp,eax
	adc	ebx,edx
	mov	eax,DWORD [24+esi]
	adc	ecx,0
	mov	DWORD [20+edi],ebp
	mov	edx,DWORD [esi]
	; saved r[5]
	; ############### Calculate word 6
	xor	ebp,ebp
	; sqr a[6]*a[0]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebp,0
	add	ebx,eax
	adc	ecx,edx
	mov	eax,DWORD [20+esi]
	adc	ebp,0
	mov	edx,DWORD [4+esi]
	; sqr a[5]*a[1]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebp,0
	add	ebx,eax
	adc	ecx,edx
	mov	eax,DWORD [16+esi]
	adc	ebp,0
	mov	edx,DWORD [8+esi]
	; sqr a[4]*a[2]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebp,0
	add	ebx,eax
	adc	ecx,edx
	mov	eax,DWORD [12+esi]
	adc	ebp,0
	; sqr a[3]*a[3]
	mul	eax
	add	ebx,eax
	adc	ecx,edx
	mov	edx,DWORD [esi]
	adc	ebp,0
	mov	DWORD [24+edi],ebx
	mov	eax,DWORD [28+esi]
	; saved r[6]
	; ############### Calculate word 7
	xor	ebx,ebx
	; sqr a[7]*a[0]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebx,0
	add	ecx,eax
	adc	ebp,edx
	mov	eax,DWORD [24+esi]
	adc	ebx,0
	mov	edx,DWORD [4+esi]
	; sqr a[6]*a[1]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebx,0
	add	ecx,eax
	adc	ebp,edx
	mov	eax,DWORD [20+esi]
	adc	ebx,0
	mov	edx,DWORD [8+esi]
	; sqr a[5]*a[2]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebx,0
	add	ecx,eax
	adc	ebp,edx
	mov	eax,DWORD [16+esi]
	adc	ebx,0
	mov	edx,DWORD [12+esi]
	; sqr a[4]*a[3]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebx,0
	add	ecx,eax
	adc	ebp,edx
	mov	eax,DWORD [28+esi]
	adc	ebx,0
	mov	DWORD [28+edi],ecx
	mov	edx,DWORD [4+esi]
	; saved r[7]
	; ############### Calculate word 8
	xor	ecx,ecx
	; sqr a[7]*a[1]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ecx,0
	add	ebp,eax
	adc	ebx,edx
	mov	eax,DWORD [24+esi]
	adc	ecx,0
	mov	edx,DWORD [8+esi]
	; sqr a[6]*a[2]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ecx,0
	add	ebp,eax
	adc	ebx,edx
	mov	eax,DWORD [20+esi]
	adc	ecx,0
	mov	edx,DWORD [12+esi]
	; sqr a[5]*a[3]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ecx,0
	add	ebp,eax
	adc	ebx,edx
	mov	eax,DWORD [16+esi]
	adc	ecx,0
	; sqr a[4]*a[4]
	mul	eax
	add	ebp,eax
	adc	ebx,edx
	mov	edx,DWORD [8+esi]
	adc	ecx,0
	mov	DWORD [32+edi],ebp
	mov	eax,DWORD [28+esi]
	; saved r[8]
	; ############### Calculate word 9
	xor	ebp,ebp
	; sqr a[7]*a[2]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebp,0
	add	ebx,eax
	adc	ecx,edx
	mov	eax,DWORD [24+esi]
	adc	ebp,0
	mov	edx,DWORD [12+esi]
	; sqr a[6]*a[3]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebp,0
	add	ebx,eax
	adc	ecx,edx
	mov	eax,DWORD [20+esi]
	adc	ebp,0
	mov	edx,DWORD [16+esi]
	; sqr a[5]*a[4]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebp,0
	add	ebx,eax
	adc	ecx,edx
	mov	eax,DWORD [28+esi]
	adc	ebp,0
	mov	DWORD [36+edi],ebx
	mov	edx,DWORD [12+esi]
	; saved r[9]
	; ############### Calculate word 10
	xor	ebx,ebx
	; sqr a[7]*a[3]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebx,0
	add	ecx,eax
	adc	ebp,edx
	mov	eax,DWORD [24+esi]
	adc	ebx,0
	mov	edx,DWORD [16+esi]
	; sqr a[6]*a[4]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebx,0
	add	ecx,eax
	adc	ebp,edx
	mov	eax,DWORD [20+esi]
	adc	ebx,0
	; sqr a[5]*a[5]
	mul	eax
	add	ecx,eax
	adc	ebp,edx
	mov	edx,DWORD [16+esi]
	adc	ebx,0
	mov	DWORD [40+edi],ecx
	mov	eax,DWORD [28+esi]
	; saved r[10]
	; ############### Calculate word 11
	xor	ecx,ecx
	; sqr a[7]*a[4]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ecx,0
	add	ebp,eax
	adc	ebx,edx
	mov	eax,DWORD [24+esi]
	adc	ecx,0
	mov	edx,DWORD [20+esi]
	; sqr a[6]*a[5]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ecx,0
	add	ebp,eax
	adc	ebx,edx
	mov	eax,DWORD [28+esi]
	adc	ecx,0
	mov	DWORD [44+edi],ebp
	mov	edx,DWORD [20+esi]
	; saved r[11]
	; ############### Calculate word 12
	xor	ebp,ebp
	; sqr a[7]*a[5]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebp,0
	add	ebx,eax
	adc	ecx,edx
	mov	eax,DWORD [24+esi]
	adc	ebp,0
	; sqr a[6]*a[6]
	mul	eax
	add	ebx,eax
	adc	ecx,edx
	mov	edx,DWORD [24+esi]
	adc	ebp,0
	mov	DWORD [48+edi],ebx
	mov	eax,DWORD [28+esi]
	; saved r[12]
	; ############### Calculate word 13
	xor	ebx,ebx
	; sqr a[7]*a[6]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebx,0
	add	ecx,eax
	adc	ebp,edx
	mov	eax,DWORD [28+esi]
	adc	ebx,0
	mov	DWORD [52+edi],ecx
	; saved r[13]
	; ############### Calculate word 14
	xor	ecx,ecx
	; sqr a[7]*a[7]
	mul	eax
	add	ebp,eax
	adc	ebx,edx
	; ecx receives the carry out of the top column but is never stored:
	; the square of an 8-word value always fits in 16 words.
	adc	ecx,0
	mov	DWORD [56+edi],ebp
	; saved r[14]
	; r[15] is the remaining high word of the accumulator.
	mov	DWORD [60+edi],ebx
	; Restore callee-saved registers in reverse push order.
	pop	ebx
	pop	ebp
	pop	edi
	pop	esi
	ret
;-----------------------------------------------------------------------
; _bn_sqr_comba4 -- r[0..7] = a[0..3] squared
;
; Fully unrolled column-wise (Comba) squaring of a 4-dword operand,
; least-significant dword first.  For column k, each cross product
; a[i]*a[j] with i+j == k and i != j is computed once and doubled;
; the diagonal term a[k/2]^2 (even k only) is added undoubled.
;
; Stack arguments on entry (32-bit pointers):
;   [esp+4] = r   destination; 8 dwords are written
;   [esp+8] = a   operand; 4 dwords are read
; Presumably the C prototype is
;   void bn_sqr_comba4(uint32_t *r, const uint32_t *a)
; -- TODO confirm against the declaring header.
;
; Returns with a plain `ret`, so the caller removes the arguments.
; Preserved: esi, edi, ebp, ebx (pushed/popped).
; Clobbered: eax, ecx, edx, flags.
;
; Register roles inside the body:
;   edi = r, esi = a
;   eax, edx = MUL operands / 64-bit product (edx:eax)
;   ebx, ecx, ebp = three-word column accumulator whose roles rotate
;              after every column (the stored word is cleared and
;              becomes the new most-significant word).
;
; Doubling idiom used for every cross product:
;   add eax,eax / adc edx,edx / adc <top>,0   ; edx:eax *= 2, bit 63 -> top
;   add <low>,eax / adc <mid>,edx / ... / adc <top>,0
; The `mov` instructions inside the second chain do not modify flags.
;
; r must not overlap a: low result words are stored while a[] is still
; being re-read for later columns.
;-----------------------------------------------------------------------
global	_bn_sqr_comba4
align	16
_bn_sqr_comba4:
L$_bn_sqr_comba4_begin:
	push	esi
	push	edi
	push	ebp
	push	ebx
	; Four registers pushed: r (first argument) is at [20+esp],
	; a (second argument) is at [24+esp].
	mov	edi,DWORD [20+esp]
	mov	esi,DWORD [24+esp]
	xor	ebx,ebx
	xor	ecx,ecx
	mov	eax,DWORD [esi]
	; ############### Calculate word 0
	; Accumulator (low..high) = ebx, ecx, ebp
	xor	ebp,ebp
	; sqr a[0]*a[0]
	mul	eax
	add	ebx,eax
	adc	ecx,edx
	mov	edx,DWORD [esi]
	adc	ebp,0
	mov	DWORD [edi],ebx
	mov	eax,DWORD [4+esi]
	; saved r[0]
	; ############### Calculate word 1
	; Accumulator (low..high) = ecx, ebp, ebx
	xor	ebx,ebx
	; sqr a[1]*a[0]
	mul	edx
	; Double the 64-bit product; the bit shifted out goes to the top word.
	add	eax,eax
	adc	edx,edx
	adc	ebx,0
	; Accumulate the doubled product into the column sum.
	add	ecx,eax
	adc	ebp,edx
	mov	eax,DWORD [8+esi]
	adc	ebx,0
	mov	DWORD [4+edi],ecx
	mov	edx,DWORD [esi]
	; saved r[1]
	; ############### Calculate word 2
	; Accumulator (low..high) = ebp, ebx, ecx
	xor	ecx,ecx
	; sqr a[2]*a[0]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ecx,0
	add	ebp,eax
	adc	ebx,edx
	mov	eax,DWORD [4+esi]
	adc	ecx,0
	; sqr a[1]*a[1]
	; Diagonal term: added once, not doubled.
	mul	eax
	add	ebp,eax
	adc	ebx,edx
	mov	edx,DWORD [esi]
	adc	ecx,0
	mov	DWORD [8+edi],ebp
	mov	eax,DWORD [12+esi]
	; saved r[2]
	; ############### Calculate word 3
	xor	ebp,ebp
	; sqr a[3]*a[0]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebp,0
	add	ebx,eax
	adc	ecx,edx
	mov	eax,DWORD [8+esi]
	adc	ebp,0
	mov	edx,DWORD [4+esi]
	; sqr a[2]*a[1]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebp,0
	add	ebx,eax
	adc	ecx,edx
	mov	eax,DWORD [12+esi]
	adc	ebp,0
	mov	DWORD [12+edi],ebx
	mov	edx,DWORD [4+esi]
	; saved r[3]
	; ############### Calculate word 4
	xor	ebx,ebx
	; sqr a[3]*a[1]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ebx,0
	add	ecx,eax
	adc	ebp,edx
	mov	eax,DWORD [8+esi]
	adc	ebx,0
	; sqr a[2]*a[2]
	mul	eax
	add	ecx,eax
	adc	ebp,edx
	mov	edx,DWORD [8+esi]
	adc	ebx,0
	mov	DWORD [16+edi],ecx
	mov	eax,DWORD [12+esi]
	; saved r[4]
	; ############### Calculate word 5
	xor	ecx,ecx
	; sqr a[3]*a[2]
	mul	edx
	add	eax,eax
	adc	edx,edx
	adc	ecx,0
	add	ebp,eax
	adc	ebx,edx
	mov	eax,DWORD [12+esi]
	adc	ecx,0
	mov	DWORD [20+edi],ebp
	; saved r[5]
	; ############### Calculate word 6
	xor	ebp,ebp
	; sqr a[3]*a[3]
	mul	eax
	add	ebx,eax
	adc	ecx,edx
	; ebp receives the carry out of the top column but is never stored:
	; the square of a 4-word value always fits in 8 words.
	adc	ebp,0
	mov	DWORD [24+edi],ebx
	; saved r[6]
	; r[7] is the remaining high word of the accumulator.
	mov	DWORD [28+edi],ecx
	; Restore callee-saved registers in reverse push order.
	pop	ebx
	pop	ebp
	pop	edi
	pop	esi
	ret