Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # SHA1 block procedure for s390x.
     11 
     12 # April 2007.
     13 #
     14 # Performance is >30% better than gcc 3.3 generated code. But the real
     15 # twist is that SHA1 hardware support is detected and utilized. In
     16 # which case performance can reach further >4.5x for larger chunks.
     17 
     18 # January 2009.
     19 #
     20 # Optimize Xupdate for amount of memory references and reschedule
     21 # instructions to favour dual-issue z10 pipeline. On z10 hardware is
     22 # "only" ~2.3x faster than software.
     23 
     24 # November 2010.
     25 #
     26 # Adapt for -m31 build. If kernel supports what's called "highgprs"
     27 # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
     28 # instructions and achieve "64-bit" performance even in 31-bit legacy
     29 # application context. The feature is not specific to any particular
     30 # processor, as long as it's "z-CPU". Latter implies that the code
     31 # remains z/Architecture specific.
     32 
     33 $kimdfunc=1;	# magic function code for kimd instruction
     34 
     35 $flavour = shift;
     36 
     37 if ($flavour =~ /3[12]/) {
     38 	$SIZE_T=4;
     39 	$g="";
     40 } else {
     41 	$SIZE_T=8;
     42 	$g="g";
     43 }
     44 
     45 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
     46 open STDOUT,">$output";
     47 
     48 $K_00_39="%r0"; $K=$K_00_39;
     49 $K_40_79="%r1";
     50 $ctx="%r2";	$prefetch="%r2";
     51 $inp="%r3";
     52 $len="%r4";
     53 
     54 $A="%r5";
     55 $B="%r6";
     56 $C="%r7";
     57 $D="%r8";
     58 $E="%r9";	@V=($A,$B,$C,$D,$E);
     59 $t0="%r10";
     60 $t1="%r11";
     61 @X=("%r12","%r13","%r14");
     62 $sp="%r15";
     63 
     64 $stdframe=16*$SIZE_T+4*8;
     65 $frame=$stdframe+16*4;
     66 
     67 sub Xupdate {
     68 my $i=shift;
     69 
     70 $code.=<<___ if ($i==15);
     71 	lg	$prefetch,$stdframe($sp)	### Xupdate(16) warm-up
     72 	lr	$X[0],$X[2]
     73 ___
     74 return if ($i&1);	# Xupdate is vectorized and executed every 2nd cycle
     75 $code.=<<___ if ($i<16);
     76 	lg	$X[0],`$i*4`($inp)	### Xload($i)
     77 	rllg	$X[1],$X[0],32
     78 ___
     79 $code.=<<___ if ($i>=16);
     80 	xgr	$X[0],$prefetch		### Xupdate($i)
     81 	lg	$prefetch,`$stdframe+4*(($i+2)%16)`($sp)
     82 	xg	$X[0],`$stdframe+4*(($i+8)%16)`($sp)
     83 	xgr	$X[0],$prefetch
     84 	rll	$X[0],$X[0],1
     85 	rllg	$X[1],$X[0],32
     86 	rll	$X[1],$X[1],1
     87 	rllg	$X[0],$X[1],32
     88 	lr	$X[2],$X[1]		# feedback
     89 ___
     90 $code.=<<___ if ($i<=70);
     91 	stg	$X[0],`$stdframe+4*($i%16)`($sp)
     92 ___
     93 unshift(@X,pop(@X));
     94 }
     95 
     96 sub BODY_00_19 {
     97 my ($i,$a,$b,$c,$d,$e)=@_;
     98 my $xi=$X[1];
     99 
    100 	&Xupdate($i);
    101 $code.=<<___;
    102 	alr	$e,$K		### $i
    103 	rll	$t1,$a,5
    104 	lr	$t0,$d
    105 	xr	$t0,$c
    106 	alr	$e,$t1
    107 	nr	$t0,$b
    108 	alr	$e,$xi
    109 	xr	$t0,$d
    110 	rll	$b,$b,30
    111 	alr	$e,$t0
    112 ___
    113 }
    114 
    115 sub BODY_20_39 {
    116 my ($i,$a,$b,$c,$d,$e)=@_;
    117 my $xi=$X[1];
    118 
    119 	&Xupdate($i);
    120 $code.=<<___;
    121 	alr	$e,$K		### $i
    122 	rll	$t1,$a,5
    123 	lr	$t0,$b
    124 	alr	$e,$t1
    125 	xr	$t0,$c
    126 	alr	$e,$xi
    127 	xr	$t0,$d
    128 	rll	$b,$b,30
    129 	alr	$e,$t0
    130 ___
    131 }
    132 
    133 sub BODY_40_59 {
    134 my ($i,$a,$b,$c,$d,$e)=@_;
    135 my $xi=$X[1];
    136 
    137 	&Xupdate($i);
    138 $code.=<<___;
    139 	alr	$e,$K		### $i
    140 	rll	$t1,$a,5
    141 	lr	$t0,$b
    142 	alr	$e,$t1
    143 	or	$t0,$c
    144 	lr	$t1,$b
    145 	nr	$t0,$d
    146 	nr	$t1,$c
    147 	alr	$e,$xi
    148 	or	$t0,$t1
    149 	rll	$b,$b,30
    150 	alr	$e,$t0
    151 ___
    152 }
    153 
    154 $code.=<<___;
    155 .text
    156 .align	64
    157 .type	Ktable,\@object
    158 Ktable: .long	0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
    159 	.skip	48	#.long	0,0,0,0,0,0,0,0,0,0,0,0
    160 .size	Ktable,.-Ktable
    161 .globl	sha1_block_data_order
    162 .type	sha1_block_data_order,\@function
    163 sha1_block_data_order:
    164 ___
    165 $code.=<<___ if ($kimdfunc);
    166 	larl	%r1,OPENSSL_s390xcap_P
    167 	lg	%r0,0(%r1)
    168 	tmhl	%r0,0x4000	# check for message-security assist
    169 	jz	.Lsoftware
    170 	lghi	%r0,0
    171 	la	%r1,`2*$SIZE_T`($sp)
    172 	.long	0xb93e0002	# kimd %r0,%r2
    173 	lg	%r0,`2*$SIZE_T`($sp)
    174 	tmhh	%r0,`0x8000>>$kimdfunc`
    175 	jz	.Lsoftware
    176 	lghi	%r0,$kimdfunc
    177 	lgr	%r1,$ctx
    178 	lgr	%r2,$inp
    179 	sllg	%r3,$len,6
    180 	.long	0xb93e0002	# kimd %r0,%r2
    181 	brc	1,.-4		# pay attention to "partial completion"
    182 	br	%r14
    183 .align	16
    184 .Lsoftware:
    185 ___
    186 $code.=<<___;
    187 	lghi	%r1,-$frame
    188 	st${g}	$ctx,`2*$SIZE_T`($sp)
    189 	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
    190 	lgr	%r0,$sp
    191 	la	$sp,0(%r1,$sp)
    192 	st${g}	%r0,0($sp)
    193 
    194 	larl	$t0,Ktable
    195 	llgf	$A,0($ctx)
    196 	llgf	$B,4($ctx)
    197 	llgf	$C,8($ctx)
    198 	llgf	$D,12($ctx)
    199 	llgf	$E,16($ctx)
    200 
    201 	lg	$K_00_39,0($t0)
    202 	lg	$K_40_79,8($t0)
    203 
    204 .Lloop:
    205 	rllg	$K_00_39,$K_00_39,32
    206 ___
    207 for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
    208 $code.=<<___;
    209 	rllg	$K_00_39,$K_00_39,32
    210 ___
    211 for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    212 $code.=<<___;	$K=$K_40_79;
    213 	rllg	$K_40_79,$K_40_79,32
    214 ___
    215 for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    216 $code.=<<___;
    217 	rllg	$K_40_79,$K_40_79,32
    218 ___
    219 for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    220 $code.=<<___;
    221 
    222 	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
    223 	la	$inp,64($inp)
    224 	al	$A,0($ctx)
    225 	al	$B,4($ctx)
    226 	al	$C,8($ctx)
    227 	al	$D,12($ctx)
    228 	al	$E,16($ctx)
    229 	st	$A,0($ctx)
    230 	st	$B,4($ctx)
    231 	st	$C,8($ctx)
    232 	st	$D,12($ctx)
    233 	st	$E,16($ctx)
    234 	brct${g} $len,.Lloop
    235 
    236 	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
    237 	br	%r14
    238 .size	sha1_block_data_order,.-sha1_block_data_order
    239 .string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
    240 .comm	OPENSSL_s390xcap_P,16,8
    241 ___
    242 
    243 $code =~ s/\`([^\`]*)\`/eval $1/gem;
    244 
    245 print $code;
    246 close STDOUT;
    247