#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA1 block procedure for Alpha.

# On 21264 performance is 33% better than code generated by vendor
# compiler, and 75% better than GCC [3.4], and in absolute terms is
# 8.7 cycles per processed byte. Implementation features vectorized
# byte swap, but not Xupdate.

# Register assignment.  Every value is a register name that is pasted
# verbatim into the generated assembly; the trailing notes give the
# hardware register numbers behind the symbolic names.
#
# NB: the Perl variables $t0..$t3 are this script's own scratch slots and
# do not correspond to the Alpha registers called t0..t3; $t2 lives in ra.
@X=map { '$'.$_ } (0..15);	# 16-entry message schedule, $0..$15

($ctx,$inp,$num)=("a0","a1","a2");			# $16..$18
@V=($A,$B,$C,$D,$E)=("a3","a4","a5","t8","t9");	# $19..$23
($t0,$t1,$t2,$t3)=("t10","t11","ra","t12");		# $24..$27
$K="AT";						# $28

# Emit SHA-1 round $i for rounds 0..19, where F(b,c,d)=(b&c)|(~b&d).
#
# Arguments: the round number followed by the names of the registers that
# currently play the roles a,b,c,d,e.  The generated instructions compute
#	e += K + X[i] + rol(a,5) + F(b,c,d);	b = rol(b,30)
# and are appended to the global $code.
#
# Rounds 0..14 also bring in the message.  Every even round finishes an
# unaligned 64-bit load (issued two rounds earlier, or at round 0 for the
# first pair), byte-swaps both 32-bit halves at once and splits the pair
# into X[i] and X[i+1]; the following odd round only consumes X[i].  From
# round 15 on, the next schedule word X[(i+1)%16] is computed alongside
# the round ("forward Xupdate").
#
# The instruction sequences inside the templates are emitted verbatim;
# only the Perl scaffolding around them selects which template is used.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $even=!($i&1);
my ($Xi,$Xj)=($X[$i%16],$X[$j%16]);			# this round's word, next word
my ($Xj2,$Xj8,$Xj13)=map { $X[($j+$_)%16] } (2,8,13);	# Xupdate inputs

if ($i==0) {			# very first pair of input words
$code.=<<___;
	ldq_u	$X[0],0+0($inp)
	ldq_u	$X[1],0+7($inp)
___
}
if ($even && $i<14) {		# start loading the pair used by round $i+2
$code.=<<___;
	ldq_u	$X[$i+2],($i+2)*4+0($inp)
	ldq_u	$X[$i+3],($i+2)*4+7($inp)
___
}

if ($i>=15) {
$code.=<<___;	# with forward Xupdate
	sll	$a,5,$t1
	addl	$K,$e,$e
	and	$b,$c,$t2
	xor	$Xj2,$Xj,$Xj

	zapnot	$a,0xf,$a
	addl	$Xi,$e,$e
	bic	$d,$b,$t3
	xor	$Xj8,$Xj,$Xj

	srl	$a,27,$t0
	addl	$t1,$e,$e
	or	$t2,$t3,$t2
	xor	$Xj13,$Xj,$Xj

	sll	$b,30,$b
	addl	$t0,$e,$e
	srl	$Xj,31,$t1

	addl	$t2,$e,$e
	srl	$b,32,$t3
	addl	$Xj,$Xj,$Xj

	or	$t3,$b,$b
	zapnot	$Xi,0xf,$Xi
	or	$t1,$Xj,$Xj
___
} elsif ($even) {
$code.=<<___;	# merge unaligned load, byte swap, then the round itself
	extql	$Xi,$inp,$Xi
	extqh	$Xj,$inp,$Xj

	or	$Xj,$Xi,$Xi	# pair of 32-bit values are fetched

	srl	$Xi,24,$t0		# vectorized byte swap
	srl	$Xi,8,$t2

	sll	$Xi,8,$t3
	sll	$Xi,24,$Xi
	zapnot	$t0,0x11,$t0
	zapnot	$t2,0x22,$t2

	zapnot	$Xi,0x88,$Xi
	or	$t0,$t2,$t0
	zapnot	$t3,0x44,$t3
	sll	$a,5,$t1

	or	$Xi,$t0,$Xi
	addl	$K,$e,$e
	and	$b,$c,$t2
	zapnot	$a,0xf,$a

	or	$Xi,$t3,$Xi
	srl	$a,27,$t0
	bic	$d,$b,$t3
	sll	$b,30,$b

	extll	$Xi,4,$Xj	# extract upper half
	or	$t2,$t3,$t2
	addl	$Xi,$e,$e

	addl	$t1,$e,$e
	srl	$b,32,$t3
	zapnot	$Xi,0xf,$Xi

	addl	$t0,$e,$e
	addl	$t2,$e,$e
	or	$t3,$b,$b
___
} else {
$code.=<<___;	# odd round below 15: the word is already in place
	sll	$a,5,$t1
	addl	$K,$e,$e
	and	$b,$c,$t2
	zapnot	$a,0xf,$a

	srl	$a,27,$t0
	addl	$Xi,$e,$e
	bic	$d,$b,$t3
	sll	$b,30,$b

	or	$t2,$t3,$t2
	addl	$t1,$e,$e
	srl	$b,32,$t3
	zapnot	$Xi,0xf,$Xi

	addl	$t0,$e,$e
	addl	$t2,$e,$e
	or	$t3,$b,$b
___
}
}

# Emit SHA-1 round $i for the parity rounds, F(b,c,d)=b^c^d.  The driver
# uses this template for rounds 20..39 and again for rounds 60..79.
#
# Arguments: the round number followed by the names of the registers that
# currently play the roles a,b,c,d,e.  The generated instructions compute
#	e += K + X[i] + rol(a,5) + F(b,c,d);	b = rol(b,30)
# and are appended to the global $code.
#
# Rounds below 79 also compute the next schedule word X[(i+1)%16], and
# rounds below 77 additionally clear the upper half of X[i].  Round 79
# produces no further schedule word; instead it loads the five 32-bit
# words at 0..16($ctx) into X[0]..X[4].  Any other round number emits
# nothing.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my ($Xi,$Xj)=($X[$i%16],$X[$j%16]);			# this round's word, next word
my ($Xj2,$Xj8,$Xj13)=map { $X[($j+$_)%16] } (2,8,13);	# Xupdate inputs

if ($i<79) {
$code.=<<___;	# with forward Xupdate
	sll	$a,5,$t1
	addl	$K,$e,$e
	zapnot	$a,0xf,$a
	xor	$Xj2,$Xj,$Xj

	sll	$b,30,$t3
	addl	$t1,$e,$e
	xor	$b,$c,$t2
	xor	$Xj8,$Xj,$Xj

	srl	$b,2,$b
	addl	$Xi,$e,$e
	xor	$d,$t2,$t2
	xor	$Xj13,$Xj,$Xj

	srl	$Xj,31,$t1
	addl	$t2,$e,$e
	srl	$a,27,$t0
	addl	$Xj,$Xj,$Xj

	or	$t3,$b,$b
	addl	$t0,$e,$e
	or	$t1,$Xj,$Xj
___
if ($i<77) {
$code.=<<___;
	zapnot	$Xi,0xf,$Xi
___
}
} elsif ($i==79) {
$code.=<<___;	# with context fetch
	sll	$a,5,$t1
	addl	$K,$e,$e
	zapnot	$a,0xf,$a
	ldl	$X[0],0($ctx)

	sll	$b,30,$t3
	addl	$t1,$e,$e
	xor	$b,$c,$t2
	ldl	$X[1],4($ctx)

	srl	$b,2,$b
	addl	$Xi,$e,$e
	xor	$d,$t2,$t2
	ldl	$X[2],8($ctx)

	srl	$a,27,$t0
	addl	$t2,$e,$e
	ldl	$X[3],12($ctx)

	or	$t3,$b,$b
	addl	$t0,$e,$e
	ldl	$X[4],16($ctx)
___
}
}

# Emit SHA-1 round $i for the majority rounds (the driver calls this for
# rounds 40..59), F(b,c,d)=(b&c)|(b&d)|(c&d).
#
# Arguments: the round number followed by the names of the registers that
# currently play the roles a,b,c,d,e.  The generated instructions compute
#	e += K + X[i] + rol(a,5) + F(b,c,d);	b = rol(b,30)
# together with the next schedule word X[(i+1)%16], then clear the upper
# half of X[i].  The text is appended to the global $code.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my ($Xi,$Xj)=($X[$i%16],$X[$j%16]);			# this round's word, next word
my ($Xj2,$Xj8,$Xj13)=map { $X[($j+$_)%16] } (2,8,13);	# Xupdate inputs

$code.=<<___;	# with forward Xupdate
	sll	$a,5,$t1
	addl	$K,$e,$e
	zapnot	$a,0xf,$a
	xor	$Xj2,$Xj,$Xj

	srl	$a,27,$t0
	and	$b,$c,$t2
	and	$b,$d,$t3
	xor	$Xj8,$Xj,$Xj

	sll	$b,30,$b
	addl	$t1,$e,$e
	xor	$Xj13,$Xj,$Xj

	srl	$Xj,31,$t1
	addl	$t0,$e,$e
	or	$t2,$t3,$t2
	and	$c,$d,$t3

	or	$t2,$t3,$t2
	srl	$b,32,$t3
	addl	$Xi,$e,$e
	addl	$Xj,$Xj,$Xj

	or	$t3,$b,$b
	addl	$t2,$e,$e
	or	$t1,$Xj,$Xj
	zapnot	$Xi,0xf,$Xi
___
}

# Assemble the whole routine into $code.
#
# Prologue: allocate a 64-byte frame and save ra, s0-s5 and fp, then load
# the five state words from $ctx into A..E.  $num is shifted left by 6 and
# added to $inp, turning it into the address at which the loop stops.
#
# Loop body: 80 rounds in four groups of 20.  Before each group the round
# constant is built in $K with an ldah/lda pair:
#	K_00_19 = 0x5A827999	K_20_39 = 0x6ED9EBA1
#	K_40_59 = 0x8F1BBCDC	K_60_79 = 0xCA62C1D6
# After every round the register list @V is rotated one place to the
# right, so the roles a,b,c,d,e move to the next registers instead of the
# values being copied; after 80 rounds the list is back in its original
# order.
#
# Epilogue: the values that round 79 loaded from $ctx into X[0]..X[4] are
# added to A..E, the result is stored back to $ctx, $inp advances by 64
# and the loop repeats while $inp is below the stop address.  Finally the
# saved registers are restored and the routine returns.
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	sha1_block_data_order
.align	5
.ent	sha1_block_data_order
sha1_block_data_order:
	lda	sp,-64(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	stq	s2,24(sp)
	stq	s3,32(sp)
	stq	s4,40(sp)
	stq	s5,48(sp)
	stq	fp,56(sp)
	.mask	0x0400fe00,-64
	.frame	sp,64,ra
	.prologue 0

	ldl	$A,0($ctx)
	ldl	$B,4($ctx)
	sll	$num,6,$num
	ldl	$C,8($ctx)
	ldl	$D,12($ctx)
	ldl	$E,16($ctx)
	addq	$inp,$num,$num

.Lloop:
	.set	noreorder
	ldah	$K,23170(zero)
	zapnot	$B,0xf,$B
	lda	$K,31129($K)	# K_00_19
___
foreach $i (0..19) {
	BODY_00_19($i,@V);
	@V=($V[-1],@V[0..$#V-1]);	# rotate roles one place to the right
}

$code.=<<___;
	ldah	$K,28378(zero)
	lda	$K,-5215($K)	# K_20_39
___
foreach $i (20..39) {
	BODY_20_39($i,@V);
	@V=($V[-1],@V[0..$#V-1]);
}

$code.=<<___;
	ldah	$K,-28900(zero)
	lda	$K,-17188($K)	# K_40_59
___
foreach $i (40..59) {
	BODY_40_59($i,@V);
	@V=($V[-1],@V[0..$#V-1]);
}

$code.=<<___;
	ldah	$K,-13725(zero)
	lda	$K,-15914($K)	# K_60_79
___
foreach $i (60..79) {		# rounds 60..79 reuse the 20..39 template
	BODY_20_39($i,@V);
	@V=($V[-1],@V[0..$#V-1]);
}

$code.=<<___;
	addl	@X[0],$A,$A
	addl	@X[1],$B,$B
	addl	@X[2],$C,$C
	addl	@X[3],$D,$D
	addl	@X[4],$E,$E
	stl	$A,0($ctx)
	stl	$B,4($ctx)
	addq	$inp,64,$inp
	stl	$C,8($ctx)
	stl	$D,12($ctx)
	stl	$E,16($ctx)
	cmpult	$inp,$num,$t1
	bne	$t1,.Lloop

	.set	noreorder
	ldq	ra,0(sp)
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	ldq	s2,24(sp)
	ldq	s3,32(sp)
	ldq	s4,40(sp)
	ldq	s5,48(sp)
	ldq	fp,56(sp)
	lda	sp,64(sp)
	ret	(ra)
.end	sha1_block_data_order
.ascii	"SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
# Write the generated assembly to the file named by the first command-line
# argument, or to standard output when no (or a false) argument is given.
#
# The three-argument form of open() takes the name literally, so leading
# characters such as '>' or '&' and surrounding whitespace are no longer
# interpreted as part of the open mode.  Both open() and close() are
# checked: buffered write errors (disk full, broken pipe, ...) only show
# up at close time, and ignoring them would leave a truncated .s file
# behind while the script still exits successfully.
$output=shift;
if ($output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";