.set	mips2
.rdata
.asciiz	"mips3.s, Version 1.2"
.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

.text
.set	noat

.align	5
# bn_mul_add_words: public entry point of the multiply-accumulate routine.
# Register use as seen in this file: $4 = result array (read and written),
# $5 = source array, $6 = word count, $7 = multiplier word.
# If the count in $6 is positive, control transfers to
# bn_mul_add_words_internal with the carry register $2 cleared; otherwise
# the routine returns 0 in $2 (and also copies it to $4).
.globl	bn_mul_add_words
.ent	bn_mul_add_words
bn_mul_add_words:
	.set	noreorder
	bgtz	$6,bn_mul_add_words_internal	# count > 0: go do the work
	move	$2,$0		# delay slot: carry/return value = 0 on both paths
	jr	$31
	move	$4,$2		# delay slot: mirror the return value into $4
.end	bn_mul_add_words

.align	5
# bn_mul_add_words_internal: multiply-accumulate worker.
# On entry: $4 = r (read and written), $5 = a, $6 = count, $7 = multiplier w,
# $2 = incoming carry (the bn_mul_add_words stub clears it).
# For every word: r[i] = low word of (a[i]*w + r[i] + carry), and the new
# carry is the high part of that sum.  Words are processed four per
# iteration while (count & -4) > 0, then up to three more in the tail.
# Returns the final carry in $2 (also copied to $4).
# Scratch: $1, $3, $8-$15, HI/LO.  Each multu is issued early so that the
# additions for the previous word overlap with the multiplication.
.ent	bn_mul_add_words_internal
bn_mul_add_words_internal:
	.set	reorder
	li	$3,-4		# mask that clears the two low bits of the count
	and	$8,$6,$3	# $8 = count rounded down to a multiple of 4
	beqz	$8,.L_bn_mul_add_words_tail	# fewer than 4 words: tail only

.L_bn_mul_add_words_loop:
	lw	$12,0($5)
	multu	$12,$7		# start a[0]*w
	lw	$13,0($4)
	lw	$14,4($5)
	lw	$15,4($4)
	lw	$8,2*4($5)
	lw	$9,2*4($4)
	addu	$13,$2		# r[0] + carry
	sltu	$2,$13,$2	# All manuals say it "compares 32-bit
				# values", but it seems to work fine
				# even on 64-bit registers.
	mflo	$1
	mfhi	$12
	addu	$13,$1		# + low word of the product
	addu	$2,$12		# carry = first carry + high word of the product
	 multu	$14,$7		# start a[1]*w
	sltu	$1,$13,$1	# carry out of the second addition
	sw	$13,0($4)
	addu	$2,$1

	lw	$10,3*4($5)
	lw	$11,3*4($4)
	addu	$15,$2
	sltu	$2,$15,$2
	mflo	$1
	mfhi	$14
	addu	$15,$1
	addu	$2,$14
	 multu	$8,$7		# start a[2]*w
	sltu	$1,$15,$1
	sw	$15,4($4)
	addu	$2,$1

	subu	$6,4		# four words consumed
	addu $4,4*4		# advance both pointers; the remaining stores of
	addu $5,4*4		# this iteration use negative offsets
	addu	$9,$2
	sltu	$2,$9,$2
	mflo	$1
	mfhi	$8
	addu	$9,$1
	addu	$2,$8
	 multu	$10,$7		# start a[3]*w
	sltu	$1,$9,$1
	sw	$9,-2*4($4)
	addu	$2,$1


	and	$8,$6,$3	# $8 = remaining count rounded down to a multiple of 4
	addu	$11,$2
	sltu	$2,$11,$2
	mflo	$1
	mfhi	$10
	addu	$11,$1
	addu	$2,$10
	sltu	$1,$11,$1
	sw	$11,-4($4)
	.set	noreorder
	bgtz	$8,.L_bn_mul_add_words_loop	# at least 4 words left
	addu	$2,$1		# delay slot: finish the carry for word 3

	beqz	$6,.L_bn_mul_add_words_return	# nothing left for the tail
	nop

.L_bn_mul_add_words_tail:
	.set	reorder
	lw	$12,0($5)
	multu	$12,$7
	lw	$13,0($4)
	subu	$6,1
	addu	$13,$2
	sltu	$2,$13,$2
	mflo	$1
	mfhi	$12
	addu	$13,$1
	addu	$2,$12
	sltu	$1,$13,$1
	sw	$13,0($4)
	addu	$2,$1
	beqz	$6,.L_bn_mul_add_words_return

	lw	$12,4($5)
	multu	$12,$7
	lw	$13,4($4)
	subu	$6,1
	addu	$13,$2
	sltu	$2,$13,$2
	mflo	$1
	mfhi	$12
	addu	$13,$1
	addu	$2,$12
	sltu	$1,$13,$1
	sw	$13,4($4)
	addu	$2,$1
	beqz	$6,.L_bn_mul_add_words_return

	lw	$12,2*4($5)	# third and last possible tail word
	multu	$12,$7
	lw	$13,2*4($4)
	addu	$13,$2
	sltu	$2,$13,$2
	mflo	$1
	mfhi	$12
	addu	$13,$1
	addu	$2,$12
	sltu	$1,$13,$1
	sw	$13,2*4($4)
	addu	$2,$1

.L_bn_mul_add_words_return:
	.set	noreorder
	jr	$31
	move	$4,$2		# delay slot: mirror the returned carry into $4
.end	bn_mul_add_words_internal

.align	5
# bn_mul_words: public entry point of the word-by-vector multiply.
# Register use as seen in this file: $4 = result array (written only),
# $5 = source array, $6 = word count, $7 = multiplier word.
# A positive count transfers to bn_mul_words_internal with the carry
# register $2 cleared; otherwise 0 is returned in $2 (and copied to $4).
.globl	bn_mul_words
.ent	bn_mul_words
bn_mul_words:
	.set	noreorder
	bgtz	$6,bn_mul_words_internal	# count > 0: go do the work
	move	$2,$0		# delay slot: carry/return value = 0 on both paths
	jr	$31
	move	$4,$2		# delay slot: mirror the return value into $4
.end	bn_mul_words

.align	5
# bn_mul_words_internal: multiply worker.
# On entry: $4 = r (written), $5 = a, $6 = count, $7 = multiplier w,
# $2 = incoming carry (the bn_mul_words stub clears it).
# For every word: r[i] = low word of (a[i]*w + carry); the new carry is the
# high word of the product plus the carry out of that addition.
# Four words per iteration while (count & -4) > 0, then up to three in the
# tail.  Returns the final carry in $2 (also copied to $4).
# Scratch: $1, $3, $8-$15, HI/LO.
.ent	bn_mul_words_internal
bn_mul_words_internal:
	.set	reorder
	li	$3,-4		# mask that clears the two low bits of the count
	and	$8,$6,$3	# $8 = count rounded down to a multiple of 4
	beqz	$8,.L_bn_mul_words_tail	# fewer than 4 words: tail only

.L_bn_mul_words_loop:
	lw	$12,0($5)
	multu	$12,$7		# start a[0]*w
	lw	$14,4($5)
	lw	$8,2*4($5)
	lw	$10,3*4($5)
	mflo	$1
	mfhi	$12
	addu	$2,$1		# low word + carry
	sltu	$13,$2,$1	# carry out of that addition
	 multu	$14,$7		# start a[1]*w
	sw	$2,0($4)
	addu	$2,$13,$12	# next carry = carry out + high word

	subu	$6,4		# four words consumed
	addu $4,4*4		# advance both pointers; the remaining stores of
	addu $5,4*4		# this iteration use negative offsets
	mflo	$1
	mfhi	$14
	addu	$2,$1
	sltu	$15,$2,$1
	 multu	$8,$7		# start a[2]*w
	sw	$2,-3*4($4)
	addu	$2,$15,$14

	mflo	$1
	mfhi	$8
	addu	$2,$1
	sltu	$9,$2,$1
	 multu	$10,$7		# start a[3]*w
	sw	$2,-2*4($4)
	addu	$2,$9,$8

	and	$8,$6,$3	# $8 = remaining count rounded down to a multiple of 4
	mflo	$1
	mfhi	$10
	addu	$2,$1
	sltu	$11,$2,$1
	sw	$2,-4($4)
	.set	noreorder
	bgtz	$8,.L_bn_mul_words_loop	# at least 4 words left
	addu	$2,$11,$10	# delay slot: carry for word 3

	beqz	$6,.L_bn_mul_words_return	# nothing left for the tail
	nop

.L_bn_mul_words_tail:
	.set	reorder
	lw	$12,0($5)
	multu	$12,$7
	subu	$6,1
	mflo	$1
	mfhi	$12
	addu	$2,$1
	sltu	$13,$2,$1
	sw	$2,0($4)
	addu	$2,$13,$12
	beqz	$6,.L_bn_mul_words_return

	lw	$12,4($5)
	multu	$12,$7
	subu	$6,1
	mflo	$1
	mfhi	$12
	addu	$2,$1
	sltu	$13,$2,$1
	sw	$2,4($4)
	addu	$2,$13,$12
	beqz	$6,.L_bn_mul_words_return

	lw	$12,2*4($5)	# third and last possible tail word
	multu	$12,$7
	mflo	$1
	mfhi	$12
	addu	$2,$1
	sltu	$13,$2,$1
	sw	$2,2*4($4)
	addu	$2,$13,$12

.L_bn_mul_words_return:
	.set	noreorder
	jr	$31
	move	$4,$2		# delay slot: mirror the returned carry into $4
.end	bn_mul_words_internal

.align	5
# bn_sqr_words: public entry point of the per-word squaring routine.
# Register use as seen in this file: $4 = result array (two words written
# per input word), $5 = source array, $6 = word count.
# A positive count transfers to bn_sqr_words_internal; otherwise the
# routine returns immediately.  $2 is cleared on both paths.
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$6,bn_sqr_words_internal	# count > 0: go do the work
	move	$2,$0		# delay slot: $2 = 0 on both paths
	jr	$31
	move	$4,$2		# delay slot: copy $2 (zero) into $4
.end	bn_sqr_words

.align	5
# bn_sqr_words_internal: per-word squaring worker.
# On entry: $4 = r, $5 = a, $6 = count.
# For every input word: r[2*i] = low word of a[i]^2 and r[2*i+1] = its high
# word; there is no carry between words.  Four input words (eight output
# words) per iteration while (count & -4) > 0, then up to three in the tail.
# $2 is never written here, so the value copied to $4 on return is whatever
# the caller left in $2 (the bn_sqr_words stub clears it).
# Scratch: $3, $8-$15, HI/LO.
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
	.set	reorder
	li	$3,-4		# mask that clears the two low bits of the count
	and	$8,$6,$3	# $8 = count rounded down to a multiple of 4
	beqz	$8,.L_bn_sqr_words_tail	# fewer than 4 words: tail only

.L_bn_sqr_words_loop:
	lw	$12,0($5)
	multu	$12,$12		# a[0]^2
	lw	$14,4($5)
	lw	$8,2*4($5)
	lw	$10,3*4($5)
	mflo	$13
	mfhi	$12
	sw	$13,0($4)	# r[0] = low word
	sw	$12,4($4)	# r[1] = high word

	multu	$14,$14		# a[1]^2
	subu	$6,4		# four input words consumed
	addu $4,8*4		# result advances by eight words,
	addu $5,4*4		# source by four; stores below use negative offsets
	mflo	$15
	mfhi	$14
	sw	$15,-6*4($4)
	sw	$14,-5*4($4)

	multu	$8,$8		# a[2]^2
	mflo	$9
	mfhi	$8
	sw	$9,-4*4($4)
	sw	$8,-3*4($4)


	multu	$10,$10		# a[3]^2
	and	$8,$6,$3	# $8 = remaining count rounded down to a multiple of 4
	mflo	$11
	mfhi	$10
	sw	$11,-2*4($4)

	.set	noreorder
	bgtz	$8,.L_bn_sqr_words_loop	# at least 4 words left
	sw	$10,-4($4)	# delay slot: high word of a[3]^2

	beqz	$6,.L_bn_sqr_words_return	# nothing left for the tail
	nop

.L_bn_sqr_words_tail:
	.set	reorder
	lw	$12,0($5)
	multu	$12,$12
	subu	$6,1
	mflo	$13
	mfhi	$12
	sw	$13,0($4)
	sw	$12,4($4)
	beqz	$6,.L_bn_sqr_words_return

	lw	$12,4($5)
	multu	$12,$12
	subu	$6,1
	mflo	$13
	mfhi	$12
	sw	$13,2*4($4)
	sw	$12,3*4($4)
	beqz	$6,.L_bn_sqr_words_return

	lw	$12,2*4($5)	# third and last possible tail word
	multu	$12,$12
	mflo	$13
	mfhi	$12
	sw	$13,4*4($4)
	sw	$12,5*4($4)

.L_bn_sqr_words_return:
	.set	noreorder
	jr	$31
	move	$4,$2		# delay slot: copy $2 (not modified here) into $4

.end	bn_sqr_words_internal

.align	5
# bn_add_words: public entry point of the multi-word addition.
# Register use as seen in this file: $4 = result array, $5 = first operand
# array, $6 = second operand array, $7 = word count.
# A positive count transfers to bn_add_words_internal with the carry
# register $2 cleared; otherwise 0 is returned in $2 (and copied to $4).
.globl	bn_add_words
.ent	bn_add_words
bn_add_words:
	.set	noreorder
	bgtz	$7,bn_add_words_internal	# count > 0: go do the work
	move	$2,$0		# delay slot: carry/return value = 0 on both paths
	jr	$31
	move	$4,$2		# delay slot: mirror the return value into $4
.end	bn_add_words

.align	5
# bn_add_words_internal: multi-word addition worker.
# On entry: $4 = r, $5 = a, $6 = b, $7 = count, $2 = incoming carry
# (the bn_add_words stub clears it).
# For every word: r[i] = a[i] + b[i] + carry.  Each word needs two carry
# checks: one for a[i]+b[i] (kept in $24 or $25) and one for adding the
# incoming carry (kept in $2); their sum is the carry into the next word.
# Four words per iteration while (count & -4) > 0, then up to three in the
# tail.  Returns the final carry in $2 (also copied to $4).
# Scratch: $1, $3, $8-$15, $24, $25.
.ent	bn_add_words_internal
bn_add_words_internal:
	.set	reorder
	li	$3,-4		# mask that clears the two low bits of the count
	and	$1,$7,$3	# $1 = count rounded down to a multiple of 4
	beqz	$1,.L_bn_add_words_tail	# fewer than 4 words: tail only

.L_bn_add_words_loop:
	lw	$12,0($5)
	lw	$8,0($6)
	subu	$7,4		# four words consumed
	lw	$13,4($5)
	and	$1,$7,$3	# $1 = remaining count rounded down to a multiple of 4
	lw	$14,2*4($5)
	addu $6,4*4		# pointers are advanced between the loads, so the
	lw	$15,3*4($5)
	addu $4,4*4		# later accesses use negative offsets
	lw	$9,-3*4($6)
	addu $5,4*4
	lw	$10,-2*4($6)
	lw	$11,-4($6)
	addu	$8,$12		# a[0] + b[0]
	sltu	$24,$8,$12	# carry out of a[0] + b[0]
	addu	$12,$8,$2	# + incoming carry
	sltu	$2,$12,$8	# carry out of adding the incoming carry
	sw	$12,-4*4($4)
	addu	$2,$24		# carry into the next word

	addu	$9,$13
	sltu	$25,$9,$13
	addu	$13,$9,$2
	sltu	$2,$13,$9
	sw	$13,-3*4($4)
	addu	$2,$25

	addu	$10,$14
	sltu	$24,$10,$14
	addu	$14,$10,$2
	sltu	$2,$14,$10
	sw	$14,-2*4($4)
	addu	$2,$24
	
	addu	$11,$15
	sltu	$25,$11,$15
	addu	$15,$11,$2
	sltu	$2,$15,$11
	sw	$15,-4($4)
	
	.set	noreorder
	bgtz	$1,.L_bn_add_words_loop	# at least 4 words left
	addu	$2,$25		# delay slot: finish the carry for word 3

	beqz	$7,.L_bn_add_words_return	# nothing left for the tail
	nop

.L_bn_add_words_tail:
	.set	reorder
	lw	$12,0($5)
	lw	$8,0($6)
	addu	$8,$12
	subu	$7,1
	sltu	$24,$8,$12
	addu	$12,$8,$2
	sltu	$2,$12,$8
	sw	$12,0($4)
	addu	$2,$24
	beqz	$7,.L_bn_add_words_return

	lw	$13,4($5)
	lw	$9,4($6)
	addu	$9,$13
	subu	$7,1
	sltu	$25,$9,$13
	addu	$13,$9,$2
	sltu	$2,$13,$9
	sw	$13,4($4)
	addu	$2,$25
	beqz	$7,.L_bn_add_words_return

	lw	$14,2*4($5)	# third and last possible tail word
	lw	$10,2*4($6)
	addu	$10,$14
	sltu	$24,$10,$14
	addu	$14,$10,$2
	sltu	$2,$14,$10
	sw	$14,2*4($4)
	addu	$2,$24

.L_bn_add_words_return:
	.set	noreorder
	jr	$31
	move	$4,$2		# delay slot: mirror the returned carry into $4

.end	bn_add_words_internal

.align	5
# bn_sub_words: public entry point of the multi-word subtraction.
# Register use as seen in this file: $4 = result array, $5 = minuend
# array, $6 = subtrahend array, $7 = word count.
# A positive count transfers to bn_sub_words_internal with the borrow
# register $2 cleared; otherwise 0 is returned in $2 (and copied to $4).
.globl	bn_sub_words
.ent	bn_sub_words
bn_sub_words:
	.set	noreorder
	bgtz	$7,bn_sub_words_internal	# count > 0: go do the work
	move	$2,$0		# delay slot: borrow/return value = 0 on both paths
	jr	$31
	move	$4,$2		# delay slot: mirror the return value into $4,
				# written the same way as the other entry stubs
				# ($2 is zero here, so the result is unchanged)
.end	bn_sub_words

.align	5
# bn_sub_words_internal: multi-word subtraction worker.
# On entry: $4 = r, $5 = a, $6 = b, $7 = count, $2 = incoming borrow
# (the bn_sub_words stub clears it).
# For every word: r[i] = a[i] - b[i] - borrow.  Each word needs two borrow
# checks: a[i] < b[i] (kept in $24 or $25) and the wrap-around caused by
# subtracting the incoming borrow (kept in $2); their sum is the borrow into
# the next word.
# Four words per iteration while (count & -4) > 0, then up to three in the
# tail.  Returns the final borrow in $2 (also copied to $4).
# Scratch: $1, $3, $8-$15, $24, $25.
.ent	bn_sub_words_internal
bn_sub_words_internal:
	.set	reorder
	li	$3,-4		# mask that clears the two low bits of the count
	and	$1,$7,$3	# $1 = count rounded down to a multiple of 4
	beqz	$1,.L_bn_sub_words_tail	# fewer than 4 words: tail only

.L_bn_sub_words_loop:
	lw	$12,0($5)
	lw	$8,0($6)
	subu	$7,4		# four words consumed
	lw	$13,4($5)
	and	$1,$7,$3	# $1 = remaining count rounded down to a multiple of 4
	lw	$14,2*4($5)
	addu $6,4*4		# pointers are advanced between the loads, so the
	lw	$15,3*4($5)
	addu $4,4*4		# later accesses use negative offsets
	lw	$9,-3*4($6)
	addu $5,4*4
	lw	$10,-2*4($6)
	lw	$11,-4($6)
	sltu	$24,$12,$8	# borrow out of a[0] - b[0]
	subu	$8,$12,$8	# a[0] - b[0]
	subu	$12,$8,$2	# - incoming borrow
	sgtu	$2,$12,$8	# result grew: subtracting the borrow wrapped
	sw	$12,-4*4($4)
	addu	$2,$24		# borrow into the next word

	sltu	$25,$13,$9
	subu	$9,$13,$9
	subu	$13,$9,$2
	sgtu	$2,$13,$9
	sw	$13,-3*4($4)
	addu	$2,$25


	sltu	$24,$14,$10
	subu	$10,$14,$10
	subu	$14,$10,$2
	sgtu	$2,$14,$10
	sw	$14,-2*4($4)
	addu	$2,$24

	sltu	$25,$15,$11
	subu	$11,$15,$11
	subu	$15,$11,$2
	sgtu	$2,$15,$11
	sw	$15,-4($4)

	.set	noreorder
	bgtz	$1,.L_bn_sub_words_loop	# at least 4 words left
	addu	$2,$25		# delay slot: finish the borrow for word 3

	beqz	$7,.L_bn_sub_words_return	# nothing left for the tail
	nop

.L_bn_sub_words_tail:
	.set	reorder
	lw	$12,0($5)
	lw	$8,0($6)
	subu	$7,1
	sltu	$24,$12,$8
	subu	$8,$12,$8
	subu	$12,$8,$2
	sgtu	$2,$12,$8
	sw	$12,0($4)
	addu	$2,$24
	beqz	$7,.L_bn_sub_words_return

	lw	$13,4($5)
	subu	$7,1
	lw	$9,4($6)
	sltu	$25,$13,$9
	subu	$9,$13,$9
	subu	$13,$9,$2
	sgtu	$2,$13,$9
	sw	$13,4($4)
	addu	$2,$25
	beqz	$7,.L_bn_sub_words_return

	lw	$14,2*4($5)	# third and last possible tail word
	lw	$10,2*4($6)
	sltu	$24,$14,$10
	subu	$10,$14,$10
	subu	$14,$10,$2
	sgtu	$2,$14,$10
	sw	$14,2*4($4)
	addu	$2,$24

.L_bn_sub_words_return:
	.set	noreorder
	jr	$31
	move	$4,$2		# delay slot: mirror the returned borrow into $4
.end	bn_sub_words_internal

.align 5
# bn_div_3_words: entry point of the three-word quotient estimate.
# On entry: $4 = pointer m to the top word of the dividend (m[-1] and m[-2]
# are read as the words below it), $5 = second divisor word d1,
# $6 = top divisor word d0.
# If m[0] == d0 the routine returns all ones in $2 (also copied to $4).
# Otherwise it branches to bn_div_3_words_internal with $4 = m[0],
# $5 = m[-1], $7 = m and $10 = d1.
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$7,$4		# we know that bn_div_words does not
				# touch $7, $10, $11 and preserves $6
				# so that we can save two arguments
				# and return address in registers
				# instead of stack:-)
				
	lw	$4,($7)		# $4 = m[0]
	move	$10,$5		# keep d1 for the correction loop
	bne	$4,$6,bn_div_3_words_internal
	lw	$5,-4($7)	# delay slot: $5 = m[-1] (loaded on both paths)
	li	$2,-1		# m[0] == d0: quotient estimate is all ones
	jr	$31
	move	$4,$2		# delay slot: mirror the return value into $4
.end	bn_div_3_words

.align	5
# bn_div_3_words_internal: estimate and correct a quotient word.
# On entry (set up by bn_div_3_words): $4 = m[0], $5 = m[-1], $6 = d0,
# $7 = m, $10 = d1.
# Step 1: call bn_div_words_internal, which leaves the quotient q in $2 and
#         the remainder in $5.  The return address is parked in $11 around
#         the call instead of on the stack.
# Step 2: form t = q*d1 ($13 = high word, $12 = low word) and loop:
#           - stop if high(t) < rem;
#           - stop if high(t) == rem and low(t) <= m[-2];
#           - otherwise q -= 1, rem += d0, t -= d1, and stop as well if the
#             addition to rem wrapped around.
# Returns q in $2 (also copied to $4).
# Scratch: $1, $8, $12-$15, $24, $25, HI/LO, plus whatever
# bn_div_words_internal clobbers.
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
	.set	reorder
	move	$11,$31		# save the return address in a register
	bal	bn_div_words_internal
	move	$31,$11		# restore the return address
	multu	$10,$2		# t = d1 * q
	lw	$14,-2*4($7)	# $14 = m[-2]
	move	$8,$0
	mfhi	$13
	mflo	$12
	sltu	$24,$13,$5	# $24 = (high(t) < rem)
.L_bn_div_3_words_inner_loop:
	bnez	$24,.L_bn_div_3_words_inner_loop_done
	sgeu	$1,$14,$12	# m[-2] >= low(t) ...
	seq	$25,$13,$5	# ... and high(t) == rem
	and	$1,$25		# $1 = 1 when q is already small enough
	sltu	$15,$12,$10	# borrow out of low(t) - d1
	addu	$5,$6		# rem += d0
	subu	$13,$15		# t -= d1 (high word takes the borrow)
	subu	$12,$10
	sltu	$24,$13,$5	# recompute high(t) < rem for the next pass
	sltu	$8,$5,$6	# rem wrapped around
	or	$24,$8		# either condition ends the loop on the next pass
	.set	noreorder
	beqz	$1,.L_bn_div_3_words_inner_loop
	subu	$2,1		# delay slot: q -= 1 (executed on both paths)
	addu	$2,1		# fell through: q was fine, undo the decrement
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
	jr	$31
	move	$4,$2		# delay slot: mirror the return value into $4
.end	bn_div_3_words_internal

.align	5
# bn_div_words: entry point of the two-word by one-word division.
# On entry: $4 = high word of the dividend, $5 = low word, $6 = divisor.
# A non-zero divisor transfers to bn_div_words_internal; a zero divisor
# returns all ones in $2 (also copied to $4).
# This stub leaves ".set noreorder" active, and the first instructions of
# bn_div_words_internal depend on that mode.
.globl	bn_div_words
.ent	bn_div_words
bn_div_words:
	.set	noreorder
	bnez	$6,bn_div_words_internal
	li	$2,-1		# I would rather signal div-by-zero
				# which can be done with 'break 7'
	jr	$31
	move	$4,$2		# delay slot: mirror the return value into $4
.end	bn_div_words

.align	5
# bn_div_words_internal: divide the two-word value $4:$5 by $6.
# On entry: $4 = high word h, $5 = low word l, $6 = divisor d (non-zero when
# reached through bn_div_words).
# On exit: $2 = quotient (also in $4), $3 = remainder (also in $5); $6 is
# shifted back to its original value.
# Method:
#   1. Normalize: shift d left until its top bit is set, counting the shifts
#      in $25.  If any of the top $25 bits of h are set, 'break 6' is
#      executed; otherwise h:l is shifted left by the same amount.
#   2. If h >= d, subtract d from h.
#   3. Produce the quotient in two half-word steps.  Each step estimates a
#      digit by dividing h by the upper half of d (or takes all ones in the
#      low half-word when the upper halves are equal), multiplies it back
#      and decrements the digit while the product exceeds the current
#      partial dividend.
#   4. Shift the remainder right by $25 to undo the normalization.
# There is no .set directive at the top: the noreorder mode left active by
# the preceding bn_div_words stub is still in effect for the first six
# instructions, so the instruction after each branch is its delay slot.
# Scratch: $1, $8, $9, $12-$15, $24, $25, HI/LO.  $7, $10 and $11 are not
# touched (bn_div_3_words relies on this).
.ent	bn_div_words_internal
bn_div_words_internal:
	move	$3,$0
	bltz	$6,.L_bn_div_words_body	# top bit already set: no shift needed
	move	$25,$3		# delay slot: shift count = 0
	sll	$6,1		# normalization loop: shift d left ...
	bgtz	$6,.-4		# ... until its top bit is set
	addu	$25,1		# delay slot: count every shift

	.set	reorder
	negu	$13,$25		# shift amounts use the low bits only: 32 - count
	li	$14,-1
	sll	$14,$13		# mask of the top 'count' bits
	and	$14,$4		# bits of h that the shift would lose
	srl	$1,$5,$13	# bits of l that move up into h
	.set	noreorder
	beqz	$14,.+12	# nothing lost: skip the break
	nop
	break	6		# signal overflow
	.set	reorder
	sll	$4,$25
	sll	$5,$25
	or	$4,$1
.L_bn_div_words_body:
	srl	$3,$6,4*4	# bits
	sgeu	$1,$4,$6
	.set	noreorder
	beqz	$1,.+12		# h < d: skip the subtraction
	nop
	subu	$4,$6		# h -= d
	.set	reorder

	li	$8,-1
	srl	$9,$4,4*4	# bits
	srl	$8,4*4	# q=0xffffffff
	beq	$3,$9,.L_bn_div_words_skip_div1	# upper halves equal: keep the maximum digit
	divu	$0,$4,$3
	mflo	$8		# digit estimate = h / upper half of d
.L_bn_div_words_skip_div1:
	multu	$6,$8		# t = d * digit
	sll	$15,$4,4*4	# bits
	srl	$1,$5,4*4	# bits
	or	$15,$1		# $15 = low half of h joined with high half of l
	mflo	$12
	mfhi	$13
.L_bn_div_words_inner_loop1:
	sltu	$14,$15,$12	# the next six instructions compute
	seq	$24,$9,$13	# $1 = (t > $9:$15) as a two-word comparison
	sltu	$1,$9,$13
	and	$14,$24
	sltu	$2,$12,$6	# borrow for t -= d ($2 is still free here)
	or	$1,$14
	.set	noreorder
	beqz	$1,.L_bn_div_words_inner_loop1_done
	subu	$13,$2		# delay slot: high(t) -= borrow
	subu	$12,$6		# low(t) -= d
	b	.L_bn_div_words_inner_loop1
	subu	$8,1		# delay slot: digit -= 1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	sll	$5,4*4	# bits
	subu	$4,$15,$12	# new partial remainder
	sll	$2,$8,4*4	# bits

	li	$8,-1
	srl	$9,$4,4*4	# bits
	srl	$8,4*4	# q=0xffffffff
	beq	$3,$9,.L_bn_div_words_skip_div2	# upper halves equal: keep the maximum digit
	divu	$0,$4,$3
	mflo	$8		# digit estimate = h / upper half of d
.L_bn_div_words_skip_div2:
	multu	$6,$8		# t = d * digit
	sll	$15,$4,4*4	# bits
	srl	$1,$5,4*4	# bits
	or	$15,$1
	mflo	$12
	mfhi	$13
.L_bn_div_words_inner_loop2:
	sltu	$14,$15,$12	# same comparison as in the first loop
	seq	$24,$9,$13
	sltu	$1,$9,$13
	and	$14,$24
	sltu	$3,$12,$6	# borrow goes to $3: $2 now holds the upper digit
	or	$1,$14
	.set	noreorder
	beqz	$1,.L_bn_div_words_inner_loop2_done
	subu	$13,$3		# delay slot: high(t) -= borrow
	subu	$12,$6		# low(t) -= d
	b	.L_bn_div_words_inner_loop2
	subu	$8,1		# delay slot: digit -= 1
	.set	reorder
.L_bn_div_words_inner_loop2_done:

	subu	$4,$15,$12	# normalized remainder
	or	$2,$8		# combine both digits into the quotient
	srl	$3,$4,$25	# $3 contains remainder if anybody wants it
	srl	$6,$25		# restore $6

	.set	noreorder
	move	$5,$3		# remainder is also handed back in $5
	jr	$31
	move	$4,$2		# delay slot: mirror the quotient into $4
.end	bn_div_words_internal

.align	5
# bn_mul_comba8: 8x8-word multiplication, r[0..15] = a[0..7] * b[0..7].
# On entry: $4 = r, $5 = a, $6 = b.
# The product is built column by column: every a[i]*b[j] with i+j = k is
# added into a three-word accumulator, whose lowest word is then stored as
# r[k].  The three accumulator words rotate through $2, $3 and $7 (named
# c1, c2, c3 in the comments below).
# Register map: a[0..3] = $12-$15, a[4] = $16, a[5] = $18, a[6] = $20,
# a[7] = $5; b[0..3] = $8-$11, b[4] = $17, b[5] = $19, b[6] = $21,
# b[7] = $6.  $24/$25 hold the low/high word of the current product and $1
# holds a carry bit.
# The comment on each multu names the product being started; the
# instructions that follow it still fold in the previous product.
# $16-$21 are saved in a 24-byte stack frame and restored before returning.
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
	.frame	$29,6*4,$31
	.mask	0x003f0000,-4
	subu $29,6*4
	sw	$21,5*4($29)
	sw	$20,4*4($29)
	sw	$19,3*4($29)
	sw	$18,2*4($29)
	sw	$17,1*4($29)
	sw	$16,0*4($29)

	.set	reorder
	lw	$12,0($5)	# If compiled with -mips3 option on
				# R5000 box assembler barks on this
				# line with "should not have mult/div
				# as last instruction in bb (R10K
				# bug)" warning. If anybody out there
				# has a clue about how to circumvent
				# this do send me a note.
				#		<appro@fy.chalmers.se>

	lw	$8,0($6)
	lw	$13,4($5)
	lw	$14,2*4($5)
	multu	$12,$8		# mul_add_c(a[0],b[0],c1,c2,c3);
	lw	$15,3*4($5)
	lw	$9,4($6)
	lw	$10,2*4($6)
	lw	$11,3*4($6)
	mflo	$2
	mfhi	$3

	lw	$16,4*4($5)
	lw	$18,5*4($5)
	multu	$12,$9		# mul_add_c(a[0],b[1],c2,c3,c1);
	lw	$20,6*4($5)
	lw	$5,7*4($5)
	lw	$17,4*4($6)
	lw	$19,5*4($6)
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$13,$8		# mul_add_c(a[1],b[0],c2,c3,c1);
	addu	$7,$25,$1
	lw	$21,6*4($6)
	lw	$6,7*4($6)
	sw	$2,0($4)	# r[0]=c1;
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	 multu	$14,$8		# mul_add_c(a[2],b[0],c3,c1,c2);
	addu	$25,$1
	addu	$7,$25
	sltu	$2,$7,$25
	sw	$3,4($4)	# r[1]=c2;

	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$13,$9		# mul_add_c(a[1],b[1],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$12,$10		# mul_add_c(a[0],b[2],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$3,$2,$25
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	 multu	$12,$11		# mul_add_c(a[0],b[3],c1,c2,c3);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,2*4($4)	# r[2]=c3;

	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$13,$10		# mul_add_c(a[1],b[2],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$7,$3,$25
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$14,$9		# mul_add_c(a[2],b[1],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$15,$8		# mul_add_c(a[3],b[0],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	 multu	$16,$8		# mul_add_c(a[4],b[0],c2,c3,c1);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	sw	$2,3*4($4)	# r[3]=c1;

	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$15,$9		# mul_add_c(a[3],b[1],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$2,$7,$25
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$14,$10		# mul_add_c(a[2],b[2],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$13,$11		# mul_add_c(a[1],b[3],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$12,$17		# mul_add_c(a[0],b[4],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	 multu	$12,$19		# mul_add_c(a[0],b[5],c3,c1,c2);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	sw	$3,4*4($4)	# r[4]=c2;

	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$13,$17		# mul_add_c(a[1],b[4],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$3,$2,$25
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$14,$11		# mul_add_c(a[2],b[3],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$15,$10		# mul_add_c(a[3],b[2],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$16,$9		# mul_add_c(a[4],b[1],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$18,$8		# mul_add_c(a[5],b[0],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	 multu	$20,$8		# mul_add_c(a[6],b[0],c1,c2,c3);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,5*4($4)	# r[5]=c3;

	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$18,$9		# mul_add_c(a[5],b[1],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$7,$3,$25
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$16,$10		# mul_add_c(a[4],b[2],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$15,$11		# mul_add_c(a[3],b[3],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$14,$17		# mul_add_c(a[2],b[4],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$13,$19		# mul_add_c(a[1],b[5],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$12,$21		# mul_add_c(a[0],b[6],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	 multu	$12,$6		# mul_add_c(a[0],b[7],c2,c3,c1);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	sw	$2,6*4($4)	# r[6]=c1;

	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$13,$21		# mul_add_c(a[1],b[6],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$2,$7,$25
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$14,$19		# mul_add_c(a[2],b[5],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$15,$17		# mul_add_c(a[3],b[4],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$16,$11		# mul_add_c(a[4],b[3],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$18,$10		# mul_add_c(a[5],b[2],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$20,$9		# mul_add_c(a[6],b[1],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$5,$8		# mul_add_c(a[7],b[0],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	 multu	$5,$9		# mul_add_c(a[7],b[1],c3,c1,c2);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	sw	$3,7*4($4)	# r[7]=c2;

	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$20,$10		# mul_add_c(a[6],b[2],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$3,$2,$25
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$18,$11		# mul_add_c(a[5],b[3],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$16,$17		# mul_add_c(a[4],b[4],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$15,$19		# mul_add_c(a[3],b[5],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$14,$21		# mul_add_c(a[2],b[6],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$13,$6		# mul_add_c(a[1],b[7],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	 multu	$14,$6		# mul_add_c(a[2],b[7],c1,c2,c3);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,8*4($4)	# r[8]=c3;

	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$15,$21		# mul_add_c(a[3],b[6],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$7,$3,$25
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$16,$19		# mul_add_c(a[4],b[5],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$18,$17		# mul_add_c(a[5],b[4],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$20,$11		# mul_add_c(a[6],b[3],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$5,$10		# mul_add_c(a[7],b[2],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	 multu	$5,$11		# mul_add_c(a[7],b[3],c2,c3,c1);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	sw	$2,9*4($4)	# r[9]=c1;

	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$20,$17		# mul_add_c(a[6],b[4],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$2,$7,$25
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$18,$19		# mul_add_c(a[5],b[5],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$16,$21		# mul_add_c(a[4],b[6],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$15,$6		# mul_add_c(a[3],b[7],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$16,$6		# mul_add_c(a[4],b[7],c3,c1,c2);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	sw	$3,10*4($4)	# r[10]=c2;

	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$18,$21		# mul_add_c(a[5],b[6],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$3,$2,$25
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$20,$19		# mul_add_c(a[6],b[5],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$5,$17		# mul_add_c(a[7],b[4],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	 multu	$5,$19		# mul_add_c(a[7],b[5],c1,c2,c3);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,11*4($4)	# r[11]=c3;

	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$20,$21		# mul_add_c(a[6],b[6],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$7,$3,$25
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$18,$6		# mul_add_c(a[5],b[7],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	 multu	$20,$6		# mul_add_c(a[6],b[7],c2,c3,c1);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	sw	$2,12*4($4)	# r[12]=c1;

	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$5,$21		# mul_add_c(a[7],b[6],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$2,$7,$25
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$5,$6		# mul_add_c(a[7],b[7],c3,c1,c2);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	sw	$3,13*4($4)	# r[13]=c2;

	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$25,$1
	addu	$2,$25
	sw	$7,14*4($4)	# r[14]=c3;
	sw	$2,15*4($4)	# r[15]=c1;

	.set	noreorder
	lw	$21,5*4($29)	# restore the saved registers
	lw	$20,4*4($29)
	lw	$19,3*4($29)
	lw	$18,2*4($29)
	lw	$17,1*4($29)
	lw	$16,0*4($29)
	jr	$31
	addu $29,6*4		# delay slot: release the stack frame
.end	bn_mul_comba8

.align	5
# bn_mul_comba4: 4x4-word multiplication, r[0..7] = a[0..3] * b[0..3].
# On entry: $4 = r, $5 = a, $6 = b.
# The product is built column by column: every a[i]*b[j] with i+j = k is
# added into a three-word accumulator, whose lowest word is then stored as
# r[k].  The three accumulator words rotate through $2, $3 and $7 (named
# c1, c2, c3 in the comments below).
# Register map: a[0..3] = $12-$15, b[0..3] = $8-$11.  $24/$25 hold the
# low/high word of the current product and $1 holds a carry bit.
# The comment on each multu names the product being started; the
# instructions that follow it still fold in the previous product.
# No stack frame is used and no register is saved.
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
	.set	reorder
	lw	$12,0($5)
	lw	$8,0($6)
	lw	$13,4($5)
	lw	$14,2*4($5)
	multu	$12,$8		# mul_add_c(a[0],b[0],c1,c2,c3);
	lw	$15,3*4($5)
	lw	$9,4($6)
	lw	$10,2*4($6)
	lw	$11,3*4($6)
	mflo	$2
	mfhi	$3
	sw	$2,0($4)	# r[0]=c1;

	multu	$12,$9		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$13,$8		# mul_add_c(a[1],b[0],c2,c3,c1);
	addu	$7,$25,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	 multu	$14,$8		# mul_add_c(a[2],b[0],c3,c1,c2);
	addu	$25,$1
	addu	$7,$25
	sltu	$2,$7,$25
	sw	$3,4($4)	# r[1]=c2;

	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$13,$9		# mul_add_c(a[1],b[1],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$12,$10		# mul_add_c(a[0],b[2],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$3,$2,$25
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	 multu	$12,$11		# mul_add_c(a[0],b[3],c1,c2,c3);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,2*4($4)	# r[2]=c3;

	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$13,$10		# mul_add_c(a[1],b[2],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$7,$3,$25
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$14,$9		# mul_add_c(a[2],b[1],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	multu	$15,$8		# mul_add_c(a[3],b[0],c1,c2,c3);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	 multu	$15,$9		# mul_add_c(a[3],b[1],c2,c3,c1);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	sw	$2,3*4($4)	# r[3]=c1;

	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$14,$10		# mul_add_c(a[2],b[2],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$2,$7,$25
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	multu	$13,$11		# mul_add_c(a[1],b[3],c2,c3,c1);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	 multu	$14,$11		# mul_add_c(a[2],b[3],c3,c1,c2);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	sw	$3,4*4($4)	# r[4]=c2;

	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	multu	$15,$10		# mul_add_c(a[3],b[2],c3,c1,c2);
	addu	$25,$1
	addu	$2,$25
	sltu	$3,$2,$25
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	 multu	$15,$11		# mul_add_c(a[3],b[3],c1,c2,c3);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,5*4($4)	# r[5]=c3;

	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$25,$1
	addu	$3,$25
	sw	$2,6*4($4)	# r[6]=c1;
	sw	$3,7*4($4)	# r[7]=c2;

	.set	noreorder
	jr	$31
	nop
.end	bn_mul_comba4

.align	5
# bn_sqr_comba8: 8-word squaring, r[0..15] = a[0..7]^2.
# On entry: $4 = r, $5 = a.
# The square is built column by column in a three-word accumulator that
# rotates through $2, $3 and $7 (c1, c2, c3 in the comments).  Off-diagonal
# products a[i]*a[j] (i != j) are computed once and added doubled
# ("mul_add_c2"); diagonal products a[i]^2 are added once ("mul_add_c").
# Register map: a[0..3] = $12-$15, a[4..7] = $8-$11.  $24/$25 hold the
# low/high word of the current product, $1 and $6 hold single carry bits.
# The comment on each multu names the product being started; the
# instructions that follow it still fold in the previous product.
#
# Doubling scheme for one off-diagonal product hi:lo:
#   - bit 31 of hi goes straight into the top accumulator word;
#   - hi = (hi << 1) | bit 31 of lo, and lo <<= 1;
#   - lo is added to the low accumulator word, giving a carry.
# The shifted hi can be 0xffffffff, so that carry must NOT be folded into
# hi before hi is added to the middle word: the sum could wrap to zero and
# the carry would be lost.  The carry and hi are therefore added to the
# middle word in two separate steps, each with its own carry into the top
# word.
# Scratch: $1, $2, $3, $6, $7, $8-$15, $24, $25, HI/LO.
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
	.set	reorder
	lw	$12,0($5)
	lw	$13,4($5)
	lw	$14,2*4($5)
	lw	$15,3*4($5)

	multu	$12,$12		# mul_add_c(a[0],b[0],c1,c2,c3);
	lw	$8,4*4($5)
	lw	$9,5*4($5)
	lw	$10,6*4($5)
	lw	$11,7*4($5)
	mflo	$2
	mfhi	$3
	sw	$2,0($4)	# r[0]=c1;

	multu	$12,$13		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$24
	mfhi	$25
	slt	$2,$25,$0	# c1 = bit 31 of hi
	sll	$25,1
	 multu	$14,$12		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$6,$24,$0
	addu	$25,$6		# hi = (hi << 1) | bit 31 of lo
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$25,$1	# c3 = doubled hi + carry ...
	sltu	$1,$7,$25	# ... which can wrap around:
	addu	$2,$1		# carry it on into c1
	sw	$3,4($4)	# r[1]=c2;

	mflo	$24
	mfhi	$25
	slt	$3,$25,$0
	sll	$25,1
	multu	$13,$13		# mul_add_c(a[1],b[1],c3,c1,c2);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$2,$1		# middle word += carry from the low word
	sltu	$1,$2,$1	# 1 only if that addition wrapped to zero
	addu	$3,$1
	addu	$2,$25		# middle word += doubled high word
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	 multu	$12,$15		# mul_add_c2(a[0],b[3],c1,c2,c3);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,2*4($4)	# r[2]=c3;

	mflo	$24
	mfhi	$25
	slt	$7,$25,$0
	sll	$25,1
	multu	$13,$14		# mul_add_c2(a[1],b[2],c1,c2,c3);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$3,$1
	sltu	$1,$3,$1
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$7,$1
	 multu	$8,$12		# mul_add_c2(a[4],b[0],c2,c3,c1);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$3,$1
	sltu	$1,$3,$1
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	sw	$2,3*4($4)	# r[3]=c1;

	mflo	$24
	mfhi	$25
	slt	$2,$25,$0
	sll	$25,1
	multu	$15,$13		# mul_add_c2(a[3],b[1],c2,c3,c1);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$1
	sltu	$1,$7,$1
	addu	$2,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$2,$1
	multu	$14,$14		# mul_add_c(a[2],b[2],c2,c3,c1);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$1
	sltu	$1,$7,$1
	addu	$2,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	 multu	$12,$9		# mul_add_c2(a[0],b[5],c3,c1,c2);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	sw	$3,4*4($4)	# r[4]=c2;

	mflo	$24
	mfhi	$25
	slt	$3,$25,$0
	sll	$25,1
	multu	$13,$8		# mul_add_c2(a[1],b[4],c3,c1,c2);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$2,$1
	sltu	$1,$2,$1
	addu	$3,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$3,$1
	multu	$14,$15		# mul_add_c2(a[2],b[3],c3,c1,c2);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$2,$1
	sltu	$1,$2,$1
	addu	$3,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	 multu	$10,$12		# mul_add_c2(a[6],b[0],c1,c2,c3);
	addu	$3,$1
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$2,$1
	sltu	$1,$2,$1
	addu	$3,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,5*4($4)	# r[5]=c3;

	mflo	$24
	mfhi	$25
	slt	$7,$25,$0
	sll	$25,1
	multu	$9,$13		# mul_add_c2(a[5],b[1],c1,c2,c3);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$3,$1
	sltu	$1,$3,$1
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$7,$1
	multu	$8,$14		# mul_add_c2(a[4],b[2],c1,c2,c3);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$3,$1
	sltu	$1,$3,$1
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$7,$1
	multu	$15,$15		# mul_add_c(a[3],b[3],c1,c2,c3);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$3,$1
	sltu	$1,$3,$1
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	 multu	$12,$11		# mul_add_c2(a[0],b[7],c2,c3,c1);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	sw	$2,6*4($4)	# r[6]=c1;

	mflo	$24
	mfhi	$25
	slt	$2,$25,$0
	sll	$25,1
	multu	$13,$10		# mul_add_c2(a[1],b[6],c2,c3,c1);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$1
	sltu	$1,$7,$1
	addu	$2,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$2,$1
	multu	$14,$9		# mul_add_c2(a[2],b[5],c2,c3,c1);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$1
	sltu	$1,$7,$1
	addu	$2,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$2,$1
	multu	$15,$8		# mul_add_c2(a[3],b[4],c2,c3,c1);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$1
	sltu	$1,$7,$1
	addu	$2,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$2,$1
	 multu	$11,$13		# mul_add_c2(a[7],b[1],c3,c1,c2);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$1
	sltu	$1,$7,$1
	addu	$2,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	sw	$3,7*4($4)	# r[7]=c2;

	mflo	$24
	mfhi	$25
	slt	$3,$25,$0
	sll	$25,1
	multu	$10,$14		# mul_add_c2(a[6],b[2],c3,c1,c2);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$2,$1
	sltu	$1,$2,$1
	addu	$3,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$3,$1
	multu	$9,$15		# mul_add_c2(a[5],b[3],c3,c1,c2);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$2,$1
	sltu	$1,$2,$1
	addu	$3,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$3,$1
	multu	$8,$8		# mul_add_c(a[4],b[4],c3,c1,c2);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$2,$1
	sltu	$1,$2,$1
	addu	$3,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	 multu	$14,$11		# mul_add_c2(a[2],b[7],c1,c2,c3);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,8*4($4)	# r[8]=c3;

	mflo	$24
	mfhi	$25
	slt	$7,$25,$0
	sll	$25,1
	multu	$15,$10		# mul_add_c2(a[3],b[6],c1,c2,c3);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$3,$1
	sltu	$1,$3,$1
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$7,$1
	multu	$8,$9		# mul_add_c2(a[4],b[5],c1,c2,c3);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$3,$1
	sltu	$1,$3,$1
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$7,$1
	 multu	$11,$15		# mul_add_c2(a[7],b[3],c2,c3,c1);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$3,$1
	sltu	$1,$3,$1
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	sw	$2,9*4($4)	# r[9]=c1;

	mflo	$24
	mfhi	$25
	slt	$2,$25,$0
	sll	$25,1
	multu	$10,$8		# mul_add_c2(a[6],b[4],c2,c3,c1);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$1
	sltu	$1,$7,$1
	addu	$2,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$2,$1
	multu	$9,$9		# mul_add_c(a[5],b[5],c2,c3,c1);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$1
	sltu	$1,$7,$1
	addu	$2,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	 multu	$8,$11		# mul_add_c2(a[4],b[7],c3,c1,c2);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	sw	$3,10*4($4)	# r[10]=c2;

	mflo	$24
	mfhi	$25
	slt	$3,$25,$0
	sll	$25,1
	multu	$9,$10		# mul_add_c2(a[5],b[6],c3,c1,c2);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$2,$1
	sltu	$1,$2,$1
	addu	$3,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$3,$1
	 multu	$11,$9		# mul_add_c2(a[7],b[5],c1,c2,c3);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$2,$1
	sltu	$1,$2,$1
	addu	$3,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,11*4($4)	# r[11]=c3;

	mflo	$24
	mfhi	$25
	slt	$7,$25,$0
	sll	$25,1
	multu	$10,$10		# mul_add_c(a[6],b[6],c1,c2,c3);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$3,$1
	sltu	$1,$3,$1
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	 multu	$10,$11		# mul_add_c2(a[6],b[7],c2,c3,c1);
	addu	$25,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	sw	$2,12*4($4)	# r[12]=c1;

	mflo	$24
	mfhi	$25
	slt	$2,$25,$0
	sll	$25,1
	 multu	$11,$11		# mul_add_c(a[7],b[7],c3,c1,c2);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$1
	sltu	$1,$7,$1
	addu	$2,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	sw	$3,13*4($4)	# r[13]=c2;

	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$25,$1
	addu	$2,$25
	sw	$7,14*4($4)	# r[14]=c3;
	sw	$2,15*4($4)	# r[15]=c1;

	.set	noreorder
	jr	$31
	nop
.end	bn_sqr_comba8

.align	5
# bn_sqr_comba4: 4-word squaring, r[0..7] = a[0..3]^2.
# On entry: $4 = r, $5 = a.
# The square is built column by column in a three-word accumulator that
# rotates through $2, $3 and $7 (c1, c2, c3 in the comments).  Off-diagonal
# products a[i]*a[j] (i != j) are computed once and added doubled
# ("mul_add_c2"); diagonal products a[i]^2 are added once ("mul_add_c").
# Register map: a[0..3] = $12-$15.  $24/$25 hold the low/high word of the
# current product, $1 and $6 hold single carry bits.
# The comment on each multu names the product being started; the
# instructions that follow it still fold in the previous product.
#
# Doubling scheme for one off-diagonal product hi:lo:
#   - bit 31 of hi goes straight into the top accumulator word;
#   - hi = (hi << 1) | bit 31 of lo, and lo <<= 1;
#   - lo is added to the low accumulator word, giving a carry.
# The shifted hi can be 0xffffffff, so that carry must NOT be folded into
# hi before hi is added to the middle word: the sum could wrap to zero and
# the carry would be lost.  The carry and hi are therefore added to the
# middle word in two separate steps, each with its own carry into the top
# word.
# Scratch: $1, $2, $3, $6, $7, $12-$15, $24, $25, HI/LO.
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
	.set	reorder
	lw	$12,0($5)
	lw	$13,4($5)
	multu	$12,$12		# mul_add_c(a[0],b[0],c1,c2,c3);
	lw	$14,2*4($5)
	lw	$15,3*4($5)
	mflo	$2
	mfhi	$3
	sw	$2,0($4)	# r[0]=c1;

	multu	$12,$13		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$24
	mfhi	$25
	slt	$2,$25,$0	# c1 = bit 31 of hi
	sll	$25,1
	 multu	$14,$12		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$6,$24,$0
	addu	$25,$6		# hi = (hi << 1) | bit 31 of lo
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$25,$1	# c3 = doubled hi + carry ...
	sltu	$1,$7,$25	# ... which can wrap around:
	addu	$2,$1		# carry it on into c1
	sw	$3,4($4)	# r[1]=c2;

	mflo	$24
	mfhi	$25
	slt	$3,$25,$0
	sll	$25,1
	multu	$13,$13		# mul_add_c(a[1],b[1],c3,c1,c2);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$2,$1		# middle word += carry from the low word
	sltu	$1,$2,$1	# 1 only if that addition wrapped to zero
	addu	$3,$1
	addu	$2,$25		# middle word += doubled high word
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	 multu	$12,$15		# mul_add_c2(a[0],b[3],c1,c2,c3);
	addu	$25,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,2*4($4)	# r[2]=c3;

	mflo	$24
	mfhi	$25
	slt	$7,$25,$0
	sll	$25,1
	multu	$13,$14		# mul_add_c2(a[1],b[2],c1,c2,c3);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$3,$1
	sltu	$1,$3,$1
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$7,$1
	 multu	$15,$13		# mul_add_c2(a[3],b[1],c2,c3,c1);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$3,$1
	sltu	$1,$3,$1
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	sw	$2,3*4($4)	# r[3]=c1;

	mflo	$24
	mfhi	$25
	slt	$2,$25,$0
	sll	$25,1
	multu	$14,$14		# mul_add_c(a[2],b[2],c2,c3,c1);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$1
	sltu	$1,$7,$1
	addu	$2,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	 multu	$14,$15		# mul_add_c2(a[2],b[3],c3,c1,c2);
	addu	$25,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	sw	$3,4*4($4)	# r[4]=c2;

	mflo	$24
	mfhi	$25
	slt	$3,$25,$0
	sll	$25,1
	 multu	$15,$15		# mul_add_c(a[3],b[3],c1,c2,c3);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$2,$1
	sltu	$1,$2,$1
	addu	$3,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,5*4($4)	# r[5]=c3;

	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$25,$1
	addu	$3,$25
	sw	$2,6*4($4)	# r[6]=c1;
	sw	$3,7*4($4)	# r[7]=c2;

	.set	noreorder
	jr	$31
	nop
.end	bn_sqr_comba4