#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2009
#
# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to
# "cluster" Address Generation Interlocks, so that one pipeline stall
# resolves several dependencies.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 50% better than code generated by gcc 4.3.
#
# Usage: perl <this script> <flavour> [...] <output-file>
#
# The first argument is the flavour; a flavour containing "31" or "32"
# selects 4-byte register save slots and the non-"g" stm/lm forms,
# anything else selects 8-byte slots and stmg/lmg.  The first following
# argument that looks like "name.ext" is the output file; without one
# the generated assembly is written to the inherited STDOUT.

use strict;
use warnings;

my $flavour = shift;
$flavour = "" unless defined $flavour;	# keep the regex matches warning-free

my ($SIZE_T, $g);
if ($flavour =~ /3[12]/) {
    $SIZE_T = 4;
    $g      = "";
} else {
    $SIZE_T = 8;
    $g      = "g";
}

# Skip arguments until one looks like a plain file name with an extension.
my $output;
while (($output = shift) && ($output !~ /^\w[\w\-]*\.\w+$/)) {}

# Redirect only when a file name was actually found, and fail loudly
# instead of silently emitting the assembly somewhere else.
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

my $rp = "%r14";	# register branched to on return
my $sp = "%r15";	# base register of the register save area

my $code = <<___;
.text

___

# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
#
# Layout used by the generated code: byte 0 of *key is x, byte 1 is y,
# and the 256 one-byte state entries start at offset 2.
{
    my $acc = "%r0";	# accumulates key-stream bytes
    my $cnt = "%r1";	# number of 8-byte groups left
    my $key = "%r2";
    my $len = "%r3";
    my $inp = "%r4";
    my $out = "%r5";

    my @XX = ("%r6", "%r7");	# current / next x index
    my @TX = ("%r8", "%r9");	# state byte at current / next x
    my $YY = "%r10";
    my $TY = "%r11";

    # Prologue: save the registers the routine clobbers.
    $code .= <<___;
.globl	RC4
.type	RC4,\@function
.align	64
RC4:
	stm${g}	%r6,%r11,6*$SIZE_T($sp)
___
    # For 31/32-bit flavours len arrives as a 32-bit value; zero-extend
    # it because the code below uses 64-bit instructions on it.
    $code .= <<___ if ($flavour =~ /3[12]/);
	llgfr	$len,$len
___
    # Load x and y, pre-increment x modulo 256, compute len/8 and
    # preload the state byte at x.
    $code .= <<___;
	llgc	$XX[0],0($key)
	llgc	$YY,1($key)
	la	$XX[0],1($XX[0])
	nill	$XX[0],0xff
	srlg	$cnt,$len,3
	ltgr	$cnt,$cnt
	llgc	$TX[0],2($XX[0],$key)
	jz	.Lshort
	j	.Loop8

.align	64
.Loop8:
___
    # Eight unrolled RC4 steps per loop iteration.  The key-stream byte
    # of step N is fetched during step N+1 (and the last one after the
    # loop), shifting the previous bytes up in $acc.
    for my $i (0 .. 7) {
	$code .= <<___;
	la	$YY,0($YY,$TX[0])	# $i
	nill	$YY,255
	la	$XX[1],1($XX[0])
	nill	$XX[1],255
___
	$code .= <<___ if ($i == 1);
	llgc	$acc,2($TY,$key)
___
	$code .= <<___ if ($i > 1);
	sllg	$acc,$acc,8
	ic	$acc,2($TY,$key)
___
	# Swap the state bytes at x and y and preload the byte at x+1.
	# When x+1 equals y the preloaded value is replaced by the byte
	# that was just stored there (kept in a register).
	$code .= <<___;
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	llgc	$TX[1],2($XX[1],$key)
	stc	$TY,2($XX[0],$key)
	cr	$XX[1],$YY
	jne	.Lcmov$i
	la	$TX[1],0($TX[0])
.Lcmov$i:
	la	$TY,0($TY,$TX[0])
	nill	$TY,255
___
	# "rotate" registers: next x/TX become current for the next step.
	# After 8 steps both pairs are back in their initial order.
	push(@TX, shift(@TX));
	push(@XX, shift(@XX));
    }

    # Finish the 8-byte group (fetch the eighth key-stream byte, XOR
    # with 8 input bytes, store), then handle the remaining len%8 bytes
    # one at a time and write x-1 and y back to the key structure.
    $code .= <<___;
	lg	$TX[1],0($inp)
	sllg	$acc,$acc,8
	la	$inp,8($inp)
	ic	$acc,2($TY,$key)
	xgr	$acc,$TX[1]
	stg	$acc,0($out)
	la	$out,8($out)
	brctg	$cnt,.Loop8

.Lshort:
	lghi	$acc,7
	ngr	$len,$acc
	jz	.Lexit
	j	.Loop1

.align	16
.Loop1:
	la	$YY,0($YY,$TX[0])
	nill	$YY,255
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	stc	$TY,2($XX[0],$key)
	ar	$TY,$TX[0]
	ahi	$XX[0],1
	nill	$TY,255
	nill	$XX[0],255
	llgc	$acc,0($inp)
	la	$inp,1($inp)
	llgc	$TY,2($TY,$key)
	llgc	$TX[0],2($XX[0],$key)
	xr	$acc,$TY
	stc	$acc,0($out)
	la	$out,1($out)
	brct	$len,.Loop1

.Lexit:
	ahi	$XX[0],-1
	stc	$XX[0],0($key)
	stc	$YY,1($key)
	lm${g}	%r6,%r11,6*$SIZE_T($sp)
	br	$rp
.size	RC4,.-RC4
.string	"RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"

___
}

# void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
#
# Emitted under the symbol private_RC4_set_key.  The generated code
# clears x and y, fills the 256 state bytes with 0..255, then makes one
# pass over the state swapping entries, reusing the len input bytes
# cyclically.
{
    my $cnt  = "%r0";	# loop counter (256, then bytes left in inp)
    my $idx  = "%r1";	# fill index, then running swap index
    my $key  = "%r2";
    my $len  = "%r3";
    my $inp  = "%r4";
    my $acc  = "%r5";	# state byte at the current position
    my $dat  = "%r6";	# input byte, then state byte being swapped
    my $ikey = "%r7";	# runs from -256 up to 0 over the state
    my $iinp = "%r8";	# offset into inp, reset after len bytes

    # The low 8 bits of $ikey become zero (tml/jz) only once it has
    # been incremented up to 0, i.e. after all 256 entries were visited.
    $code .= <<___;
.globl	private_RC4_set_key
.type	private_RC4_set_key,\@function
.align	64
private_RC4_set_key:
	stm${g}	%r6,%r8,6*$SIZE_T($sp)
	lhi	$cnt,256
	la	$idx,0(%r0)
	sth	$idx,0($key)
.align	4
.L1stloop:
	stc	$idx,2($idx,$key)
	la	$idx,1($idx)
	brct	$cnt,.L1stloop

	lghi	$ikey,-256
	lr	$cnt,$len
	la	$iinp,0(%r0)
	la	$idx,0(%r0)
.align	16
.L2ndloop:
	llgc	$acc,2+256($ikey,$key)
	llgc	$dat,0($iinp,$inp)
	la	$idx,0($idx,$acc)
	la	$ikey,1($ikey)
	la	$idx,0($idx,$dat)
	nill	$idx,255
	la	$iinp,1($iinp)
	tml	$ikey,255
	llgc	$dat,2($idx,$key)
	stc	$dat,2+256-1($ikey,$key)
	stc	$acc,2($idx,$key)
	jz	.Ldone
	brct	$cnt,.L2ndloop
	lr	$cnt,$len
	la	$iinp,0(%r0)
	j	.L2ndloop
.Ldone:
	lm${g}	%r6,%r8,6*$SIZE_T($sp)
	br	$rp
.size	private_RC4_set_key,.-private_RC4_set_key

___
}

# const char *RC4_options()
#
# Returns the address of a constant string naming this implementation.
$code .= <<___;
.globl	RC4_options
.type	RC4_options,\@function
.align	16
RC4_options:
	larl	%r2,.Loptions
	br	%r14
.size	RC4_options,.-RC4_options
.section	.rodata
.Loptions:
.align	8
.string	"rc4(8x,char)"
___

print $code;
# force flush, and report write errors that only surface at close time
close STDOUT or die "error closing STDOUT: $!";