#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2009
#
# Performance is 2x of gcc 3.4.6 on z10. The coding "secret" is to
# "cluster" Address Generation Interlocks, so that one pipeline stall
# resolves several dependencies.

# November 2010.
#
# Adapt for -m31 build. If the kernel supports what's called the
# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
# legacy application context. The feature is not specific to any
# particular processor, as long as it's a "z-CPU". The latter implies
# that the code remains z/Architecture specific. On z990 it was measured
# to perform 50% better than code generated by gcc 4.3.

     26 $flavour = shift;
     27 
     28 if ($flavour =~ /3[12]/) {
     29 	$SIZE_T=4;
     30 	$g="";
     31 } else {
     32 	$SIZE_T=8;
     33 	$g="g";
     34 }
     35 
     36 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
     37 open STDOUT,">$output";
     38 
     39 $rp="%r14";
     40 $sp="%r15";
     41 $code=<<___;
     42 .text
     43 
     44 ___
     45 
     46 # void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
     47 {
     48 $acc="%r0";
     49 $cnt="%r1";
     50 $key="%r2";
     51 $len="%r3";
     52 $inp="%r4";
     53 $out="%r5";
     54 
     55 @XX=("%r6","%r7");
     56 @TX=("%r8","%r9");
     57 $YY="%r10";
     58 $TY="%r11";
     59 
     60 $code.=<<___;
     61 .globl	RC4
     62 .type	RC4,\@function
     63 .align	64
     64 RC4:
     65 	stm${g}	%r6,%r11,6*$SIZE_T($sp)
     66 ___
     67 $code.=<<___ if ($flavour =~ /3[12]/);
     68 	llgfr	$len,$len
     69 ___
     70 $code.=<<___;
     71 	llgc	$XX[0],0($key)
     72 	llgc	$YY,1($key)
     73 	la	$XX[0],1($XX[0])
     74 	nill	$XX[0],0xff
     75 	srlg	$cnt,$len,3
     76 	ltgr	$cnt,$cnt
     77 	llgc	$TX[0],2($XX[0],$key)
     78 	jz	.Lshort
     79 	j	.Loop8
     80 
     81 .align	64
     82 .Loop8:
     83 ___
     84 for ($i=0;$i<8;$i++) {
     85 $code.=<<___;
     86 	la	$YY,0($YY,$TX[0])	# $i
     87 	nill	$YY,255
     88 	la	$XX[1],1($XX[0])
     89 	nill	$XX[1],255
     90 ___
     91 $code.=<<___ if ($i==1);
     92 	llgc	$acc,2($TY,$key)
     93 ___
     94 $code.=<<___ if ($i>1);
     95 	sllg	$acc,$acc,8
     96 	ic	$acc,2($TY,$key)
     97 ___
     98 $code.=<<___;
     99 	llgc	$TY,2($YY,$key)
    100 	stc	$TX[0],2($YY,$key)
    101 	llgc	$TX[1],2($XX[1],$key)
    102 	stc	$TY,2($XX[0],$key)
    103 	cr	$XX[1],$YY
    104 	jne	.Lcmov$i
    105 	la	$TX[1],0($TX[0])
    106 .Lcmov$i:
    107 	la	$TY,0($TY,$TX[0])
    108 	nill	$TY,255
    109 ___
    110 push(@TX,shift(@TX)); push(@XX,shift(@XX));     # "rotate" registers
    111 }
    112 
    113 $code.=<<___;
    114 	lg	$TX[1],0($inp)
    115 	sllg	$acc,$acc,8
    116 	la	$inp,8($inp)
    117 	ic	$acc,2($TY,$key)
    118 	xgr	$acc,$TX[1]
    119 	stg	$acc,0($out)
    120 	la	$out,8($out)
    121 	brctg	$cnt,.Loop8
    122 
    123 .Lshort:
    124 	lghi	$acc,7
    125 	ngr	$len,$acc
    126 	jz	.Lexit
    127 	j	.Loop1
    128 
    129 .align	16
    130 .Loop1:
    131 	la	$YY,0($YY,$TX[0])
    132 	nill	$YY,255
    133 	llgc	$TY,2($YY,$key)
    134 	stc	$TX[0],2($YY,$key)
    135 	stc	$TY,2($XX[0],$key)
    136 	ar	$TY,$TX[0]
    137 	ahi	$XX[0],1
    138 	nill	$TY,255
    139 	nill	$XX[0],255
    140 	llgc	$acc,0($inp)
    141 	la	$inp,1($inp)
    142 	llgc	$TY,2($TY,$key)
    143 	llgc	$TX[0],2($XX[0],$key)
    144 	xr	$acc,$TY
    145 	stc	$acc,0($out)
    146 	la	$out,1($out)
    147 	brct	$len,.Loop1
    148 
    149 .Lexit:
    150 	ahi	$XX[0],-1
    151 	stc	$XX[0],0($key)
    152 	stc	$YY,1($key)
    153 	lm${g}	%r6,%r11,6*$SIZE_T($sp)
    154 	br	$rp
    155 .size	RC4,.-RC4
    156 .string	"RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
    157 
    158 ___
    159 }
    160 
    161 # void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
    162 {
    163 $cnt="%r0";
    164 $idx="%r1";
    165 $key="%r2";
    166 $len="%r3";
    167 $inp="%r4";
    168 $acc="%r5";
    169 $dat="%r6";
    170 $ikey="%r7";
    171 $iinp="%r8";
    172 
    173 $code.=<<___;
    174 .globl	private_RC4_set_key
    175 .type	private_RC4_set_key,\@function
    176 .align	64
    177 private_RC4_set_key:
    178 	stm${g}	%r6,%r8,6*$SIZE_T($sp)
    179 	lhi	$cnt,256
    180 	la	$idx,0(%r0)
    181 	sth	$idx,0($key)
    182 .align	4
    183 .L1stloop:
    184 	stc	$idx,2($idx,$key)
    185 	la	$idx,1($idx)
    186 	brct	$cnt,.L1stloop
    187 
    188 	lghi	$ikey,-256
    189 	lr	$cnt,$len
    190 	la	$iinp,0(%r0)
    191 	la	$idx,0(%r0)
    192 .align	16
    193 .L2ndloop:
    194 	llgc	$acc,2+256($ikey,$key)
    195 	llgc	$dat,0($iinp,$inp)
    196 	la	$idx,0($idx,$acc)
    197 	la	$ikey,1($ikey)
    198 	la	$idx,0($idx,$dat)
    199 	nill	$idx,255
    200 	la	$iinp,1($iinp)
    201 	tml	$ikey,255
    202 	llgc	$dat,2($idx,$key)
    203 	stc	$dat,2+256-1($ikey,$key)
    204 	stc	$acc,2($idx,$key)
    205 	jz	.Ldone
    206 	brct	$cnt,.L2ndloop
    207 	lr	$cnt,$len
    208 	la	$iinp,0(%r0)
    209 	j	.L2ndloop
    210 .Ldone:
    211 	lm${g}	%r6,%r8,6*$SIZE_T($sp)
    212 	br	$rp
    213 .size	private_RC4_set_key,.-private_RC4_set_key
    214 
    215 ___
    216 }
    217 
    218 # const char *RC4_options()
    219 $code.=<<___;
    220 .globl	RC4_options
    221 .type	RC4_options,\@function
    222 .align	16
    223 RC4_options:
    224 	larl	%r2,.Loptions
    225 	br	%r14
    226 .size	RC4_options,.-RC4_options
    227 .section	.rodata
    228 .Loptions:
    229 .align	8
    230 .string	"rc4(8x,char)"
    231 ___
    232 
    233 print $code;
    234 close STDOUT;	# force flush
    235