Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # Performance improvement is not really impressive on pre-T1 CPU: +8%
     11 # over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
     12 # turned to be 40% faster than 64-bit code generated by Sun C 5.8 and
     13 # >2x than 64-bit code generated by gcc 3.4. And there is a gimmick.
     14 # X[16] vector is packed to 8 64-bit registers and as result nothing
     15 # is spilled on stack. In addition input data is loaded in compact
     16 # instruction sequence, thus minimizing the window when the code is
     17 # subject to [inter-thread] cache-thrashing hazard. The goal is to
     18 # ensure scalability on UltraSPARC T1, or rather to avoid decay when
     19 # amount of active threads exceeds the number of physical cores.
     20 
     21 $bits=32;
     22 for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
     23 if ($bits==64)	{ $bias=2047; $frame=192; }
     24 else		{ $bias=0;    $frame=112; }
     25 
     26 $output=shift;
     27 open STDOUT,">$output";
     28 
     29 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
     30 $rot1m="%g2";
     31 $tmp64="%g3";
     32 $Xi="%g4";
     33 $A="%l0";
     34 $B="%l1";
     35 $C="%l2";
     36 $D="%l3";
     37 $E="%l4";
     38 @V=($A,$B,$C,$D,$E);
     39 $K_00_19="%l5";
     40 $K_20_39="%l6";
     41 $K_40_59="%l7";
     42 $K_60_79="%g5";
     43 @K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
     44 
     45 $ctx="%i0";
     46 $inp="%i1";
     47 $len="%i2";
     48 $tmp0="%i3";
     49 $tmp1="%i4";
     50 $tmp2="%i5";
     51 
     52 sub BODY_00_15 {
     53 my ($i,$a,$b,$c,$d,$e)=@_;
     54 my $xi=($i&1)?@X[($i/2)%8]:$Xi;
     55 
     56 $code.=<<___;
     57 	sll	$a,5,$tmp0		!! $i
     58 	add	@K[$i/20],$e,$e
     59 	srl	$a,27,$tmp1
     60 	add	$tmp0,$e,$e
     61 	and	$c,$b,$tmp0
     62 	add	$tmp1,$e,$e
     63 	sll	$b,30,$tmp2
     64 	andn	$d,$b,$tmp1
     65 	srl	$b,2,$b
     66 	or	$tmp1,$tmp0,$tmp1
     67 	or	$tmp2,$b,$b
     68 	add	$xi,$e,$e
     69 ___
     70 if ($i&1 && $i<15) {
     71 	$code.=
     72 	"	srlx	@X[(($i+1)/2)%8],32,$Xi\n";
     73 }
     74 $code.=<<___;
     75 	add	$tmp1,$e,$e
     76 ___
     77 }
     78 
     79 sub Xupdate {
     80 my ($i,$a,$b,$c,$d,$e)=@_;
     81 my $j=$i/2;
     82 
     83 if ($i&1) {
     84 $code.=<<___;
     85 	sll	$a,5,$tmp0		!! $i
     86 	add	@K[$i/20],$e,$e
     87 	srl	$a,27,$tmp1
     88 ___
     89 } else {
     90 $code.=<<___;
     91 	sllx	@X[($j+6)%8],32,$Xi	! Xupdate($i)
     92 	xor	@X[($j+1)%8],@X[$j%8],@X[$j%8]
     93 	srlx	@X[($j+7)%8],32,$tmp1
     94 	xor	@X[($j+4)%8],@X[$j%8],@X[$j%8]
     95 	sll	$a,5,$tmp0		!! $i
     96 	or	$tmp1,$Xi,$Xi
     97 	add	@K[$i/20],$e,$e		!!
     98 	xor	$Xi,@X[$j%8],@X[$j%8]
     99 	srlx	@X[$j%8],31,$Xi
    100 	add	@X[$j%8],@X[$j%8],@X[$j%8]
    101 	and	$Xi,$rot1m,$Xi
    102 	andn	@X[$j%8],$rot1m,@X[$j%8]
    103 	srl	$a,27,$tmp1		!!
    104 	or	$Xi,@X[$j%8],@X[$j%8]
    105 ___
    106 }
    107 }
    108 
    109 sub BODY_16_19 {
    110 my ($i,$a,$b,$c,$d,$e)=@_;
    111 
    112 	&Xupdate(@_);
    113     if ($i&1) {
    114 	$xi=@X[($i/2)%8];
    115     } else {
    116 	$xi=$Xi;
    117 	$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
    118     }
    119 $code.=<<___;
    120 	add	$tmp0,$e,$e		!!
    121 	and	$c,$b,$tmp0
    122 	add	$tmp1,$e,$e
    123 	sll	$b,30,$tmp2
    124 	add	$xi,$e,$e
    125 	andn	$d,$b,$tmp1
    126 	srl	$b,2,$b
    127 	or	$tmp1,$tmp0,$tmp1
    128 	or	$tmp2,$b,$b
    129 	add	$tmp1,$e,$e
    130 ___
    131 }
    132 
    133 sub BODY_20_39 {
    134 my ($i,$a,$b,$c,$d,$e)=@_;
    135 my $xi;
    136 	&Xupdate(@_);
    137     if ($i&1) {
    138 	$xi=@X[($i/2)%8];
    139     } else {
    140 	$xi=$Xi;
    141 	$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
    142     }
    143 $code.=<<___;
    144 	add	$tmp0,$e,$e		!!
    145 	xor	$c,$b,$tmp0
    146 	add	$tmp1,$e,$e
    147 	sll	$b,30,$tmp2
    148 	xor	$d,$tmp0,$tmp1
    149 	srl	$b,2,$b
    150 	add	$tmp1,$e,$e
    151 	or	$tmp2,$b,$b
    152 	add	$xi,$e,$e
    153 ___
    154 }
    155 
    156 sub BODY_40_59 {
    157 my ($i,$a,$b,$c,$d,$e)=@_;
    158 my $xi;
    159 	&Xupdate(@_);
    160     if ($i&1) {
    161 	$xi=@X[($i/2)%8];
    162     } else {
    163 	$xi=$Xi;
    164 	$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
    165     }
    166 $code.=<<___;
    167 	add	$tmp0,$e,$e		!!
    168 	and	$c,$b,$tmp0
    169 	add	$tmp1,$e,$e
    170 	sll	$b,30,$tmp2
    171 	or	$c,$b,$tmp1
    172 	srl	$b,2,$b
    173 	and	$d,$tmp1,$tmp1
    174 	add	$xi,$e,$e
    175 	or	$tmp1,$tmp0,$tmp1
    176 	or	$tmp2,$b,$b
    177 	add	$tmp1,$e,$e
    178 ___
    179 }
    180 
    181 $code.=<<___ if ($bits==64);
    182 .register	%g2,#scratch
    183 .register	%g3,#scratch
    184 ___
    185 $code.=<<___;
    186 .section	".text",#alloc,#execinstr
    187 
    188 .align	32
    189 .globl	sha1_block_data_order
    190 sha1_block_data_order:
    191 	save	%sp,-$frame,%sp
    192 	sllx	$len,6,$len
    193 	add	$inp,$len,$len
    194 
    195 	or	%g0,1,$rot1m
    196 	sllx	$rot1m,32,$rot1m
    197 	or	$rot1m,1,$rot1m
    198 
    199 	ld	[$ctx+0],$A
    200 	ld	[$ctx+4],$B
    201 	ld	[$ctx+8],$C
    202 	ld	[$ctx+12],$D
    203 	ld	[$ctx+16],$E
    204 	andn	$inp,7,$tmp0
    205 
    206 	sethi	%hi(0x5a827999),$K_00_19
    207 	or	$K_00_19,%lo(0x5a827999),$K_00_19
    208 	sethi	%hi(0x6ed9eba1),$K_20_39
    209 	or	$K_20_39,%lo(0x6ed9eba1),$K_20_39
    210 	sethi	%hi(0x8f1bbcdc),$K_40_59
    211 	or	$K_40_59,%lo(0x8f1bbcdc),$K_40_59
    212 	sethi	%hi(0xca62c1d6),$K_60_79
    213 	or	$K_60_79,%lo(0xca62c1d6),$K_60_79
    214 
    215 .Lloop:
    216 	ldx	[$tmp0+0],@X[0]
    217 	ldx	[$tmp0+16],@X[2]
    218 	ldx	[$tmp0+32],@X[4]
    219 	ldx	[$tmp0+48],@X[6]
    220 	and	$inp,7,$tmp1
    221 	ldx	[$tmp0+8],@X[1]
    222 	sll	$tmp1,3,$tmp1
    223 	ldx	[$tmp0+24],@X[3]
    224 	subcc	%g0,$tmp1,$tmp2	! should be 64-$tmp1, but -$tmp1 works too
    225 	ldx	[$tmp0+40],@X[5]
    226 	bz,pt	%icc,.Laligned
    227 	ldx	[$tmp0+56],@X[7]
    228 
    229 	sllx	@X[0],$tmp1,@X[0]
    230 	ldx	[$tmp0+64],$tmp64
    231 ___
    232 for($i=0;$i<7;$i++)
    233 {   $code.=<<___;
    234 	srlx	@X[$i+1],$tmp2,$Xi
    235 	sllx	@X[$i+1],$tmp1,@X[$i+1]
    236 	or	$Xi,@X[$i],@X[$i]
    237 ___
    238 }
    239 $code.=<<___;
    240 	srlx	$tmp64,$tmp2,$tmp64
    241 	or	$tmp64,@X[7],@X[7]
    242 .Laligned:
    243 	srlx	@X[0],32,$Xi
    244 ___
    245 for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
    246 for (;$i<20;$i++)	{ &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
    247 for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    248 for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    249 for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    250 $code.=<<___;
    251 
    252 	ld	[$ctx+0],@X[0]
    253 	ld	[$ctx+4],@X[1]
    254 	ld	[$ctx+8],@X[2]
    255 	ld	[$ctx+12],@X[3]
    256 	add	$inp,64,$inp
    257 	ld	[$ctx+16],@X[4]
    258 	cmp	$inp,$len
    259 
    260 	add	$A,@X[0],$A
    261 	st	$A,[$ctx+0]
    262 	add	$B,@X[1],$B
    263 	st	$B,[$ctx+4]
    264 	add	$C,@X[2],$C
    265 	st	$C,[$ctx+8]
    266 	add	$D,@X[3],$D
    267 	st	$D,[$ctx+12]
    268 	add	$E,@X[4],$E
    269 	st	$E,[$ctx+16]
    270 
    271 	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
    272 	andn	$inp,7,$tmp0
    273 
    274 	ret
    275 	restore
    276 .type	sha1_block_data_order,#function
    277 .size	sha1_block_data_order,(.-sha1_block_data_order)
    278 .asciz	"SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
    279 .align	4
    280 ___
    281 
    282 $code =~ s/\`([^\`]*)\`/eval $1/gem;
    283 print $code;
    284 close STDOUT;
    285