#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# RC4 for PA-RISC.

# June 2009.
#
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
# For reference, [4x] unrolled loop is >40% faster than folded one.
# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
# is believed to be not sufficient to justify the effort...
#
# Special thanks to polarhome.com for providing HP-UX account.

# Usage: <this script> <flavour> [<output file>]
#
# The script builds the assembly text in $code and prints it. A flavour
# matching /64/ selects the 2.0W/8-byte settings, anything else the
# 1.0/4-byte ones. If an output file is named, STDOUT is redirected to
# it; otherwise the text goes to the inherited STDOUT.

# Directory part of the script path (with trailing separator), used to
# locate opensslconf.h. Empty when $0 carries no directory component.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

$flavour = shift;
$output  = shift;

# Fail loudly instead of silently discarding the generated code when
# the output file cannot be created.
if (defined($output) && $output ne "") {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
} else {
	$LEVEL		="1.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
}

$FRAME=4*$SIZE_T+$FRAME_MARKER;	# 4 saved regs + frame marker
				#                [+ argument transfer]

# Element size of the RC4 state table: 1 if opensslconf.h defines
# RC4_INT as something ending in "char", 4 for any other definition.
# The probe is best-effort: if the header cannot be opened or has no
# RC4_INT define, the default below stays in effect.
$SZ=1;				# defaults to RC4_CHAR
if (open my $conf, '<', "${dir}../../opensslconf.h") {
    while (<$conf>) {
	if (m/#\s*define\s+RC4_INT\s+(.*)/) {
	    $SZ = ($1=~/char$/) ? 1 : 4;
	    last;
	}
    }
    close $conf;
}

# Load/store mnemonics matching the table element size.
if ($SZ==1) {	# RC4_CHAR
    $LD="ldb";
    $LDX="ldbx";
    $MKX="addl";
    $ST="stb";
} else {	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
    $LD="ldw";
    $LDX="ldwx,s";
    $MKX="sh2addl";
    $ST="stw";
}

# Registers holding the four RC4() arguments (exported as ARGW0..ARGW3).
$key="%r26";
$len="%r25";
$inp="%r24";
$out="%r23";

# Working registers. @XX and @TX are register pairs that the unrolled
# loop body alternates between ("rotates") on every step.
@XX=("%r19","%r20");
@TX=("%r21","%r22");
$YY="%r28";
$TY="%r29";

$acc="%r1";
$ix="%r2";
$iy="%r3";
$dat0="%r4";
$dat1="%r5";
$rem="%r6";
$mask="%r31";

# Append four copies of the loop body to $code.
#
# $i is interpolated into the backtick expressions when the heredoc is
# built, so each copy carries its own literal step number; the
# expressions themselves are evaluated by the final s///e pass. For
# steps 1..3 they emit a load through $TY into $dat1 and a dep/zdep of
# $dat1 into $acc at bit position 8*($i-1)+7; for step 0 they evaluate
# to an empty string. @TX and @XX are rotated after every copy, so
# after four copies both are back in their initial order.
sub unrolledloopbody {
for my $i (0 .. 3) {
$code.=<<___;
	ldo	1($XX[0]),$XX[1]
	`sprintf("$LDX	%$TY(%$key),%$dat1") if ($i>0)`
	and	$mask,$XX[1],$XX[1]
	$LDX	$YY($key),$TY
	$MKX	$YY,$key,$ix
	$LDX	$XX[1]($key),$TX[1]
	$MKX	$XX[0],$key,$iy
	$ST	$TX[0],0($ix)
	comclr,<> $XX[1],$YY,%r0	; conditional
	copy	$TX[0],$TX[1]		; move
	`sprintf("%sdep	%$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
	$ST	$TY,0($iy)
	addl	$TX[0],$TY,$TY
	addl	$TX[1],$YY,$YY
	and	$mask,$TY,$TY
	and	$mask,$YY,$YY
___
push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
} }

# Append a byte-at-a-time loop to $code.
#
#   $label - label placed at the top of the loop and used as the
#            target of the closing addib
#   $count - register that the closing addib decrements by one on
#            every pass
#
# Each pass loads one byte with ldbx $inp($out), xors it with the byte
# fetched through $TY and stores the result with stb.
sub foldedloop {
my ($label,$count)=@_;
$code.=<<___;
$label
	$MKX	$YY,$key,$iy
	$LDX	$YY($key),$TY
	$MKX	$XX[0],$key,$ix
	$ST	$TX[0],0($iy)
	ldo	1($XX[0]),$XX[0]
	$ST	$TY,0($ix)
	addl	$TX[0],$TY,$TY
	ldbx	$inp($out),$dat1
	and	$mask,$TY,$TY
	and	$mask,$XX[0],$XX[0]
	$LDX	$TY($key),$acc
	$LDX	$XX[0]($key),$TX[0]
	ldo	1($out),$out
	xor	$dat1,$acc,$acc
	addl	$TX[0],$YY,$YY
	stb	$acc,-1($out)
	addib,<> -1,$count,$label	; $count is always small
	and	$mask,$YY,$YY
___
}

# RC4: prologue, warm-up and the check whether $len is large enough for
# the 4x unrolled path.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
RC4
	.PROC
	.CALLINFO	FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)

	cmpib,*=	0,$len,L\$abort
	sub	$inp,$out,$inp		; distance between $inp and $out

	$LD	`0*$SZ`($key),$XX[0]
	$LD	`1*$SZ`($key),$YY
	ldo	`2*$SZ`($key),$key

	ldi	0xff,$mask
	ldi	3,$dat0

	ldo	1($XX[0]),$XX[0]	; warm up loop
	and	$mask,$XX[0],$XX[0]
	$LDX	$XX[0]($key),$TX[0]
	addl	$TX[0],$YY,$YY
	cmpib,*>>=	6,$len,L\$oop1	; is $len large enough to bother?
	and	$mask,$YY,$YY

	and,<>	$out,$dat0,$rem		; is $out aligned?
	b	L\$alignedout
	subi	4,$rem,$rem
	sub	$len,$rem,$len
___
foldedloop("L\$alignout",$rem);		# process till $out is aligned

# Unrolled loop for input that is not word-aligned relative to $out.
$code.=<<___;
L\$alignedout				; $len is at least 4 here
	and,<>	$inp,$dat0,$acc		; is $inp aligned?
	b	L\$oop4
	sub	$inp,$acc,$rem		; align $inp

	sh3addl	$acc,%r0,$acc
	subi	32,$acc,$acc
	mtctl	$acc,%cr11		; load %sar with vshd align factor
	ldwx	$rem($out),$dat0
	ldo	4($rem),$rem
L\$oop4misalignedinp
___
unrolledloopbody();
$code.=<<___;
	$LDX	$TY($key),$ix
	ldwx	$rem($out),$dat1
	ldo	-4($len),$len
	or	$ix,$acc,$acc		; last piece, no need to dep
	vshd	$dat0,$dat1,$iy		; align data
	copy	$dat1,$dat0
	xor	$iy,$acc,$acc
	stw	$acc,0($out)
	cmpib,*<<	3,$len,L\$oop4misalignedinp
	ldo	4($out),$out
	cmpib,*=	0,$len,L\$done
	nop
	b	L\$oop1
	nop

	.ALIGN	8
L\$oop4
___
# Unrolled loop for aligned input.
unrolledloopbody();
$code.=<<___;
	$LDX	$TY($key),$ix
	ldwx	$inp($out),$dat0
	ldo	-4($len),$len
	or	$ix,$acc,$acc		; last piece, no need to dep
	xor	$dat0,$acc,$acc
	stw	$acc,0($out)
	cmpib,*<<	3,$len,L\$oop4
	ldo	4($out),$out
	cmpib,*=	0,$len,L\$done
	nop
___
# Remaining bytes, and short inputs branched here from the warm-up.
foldedloop("L\$oop1",$len);
$code.=<<___;
L\$done
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2
	ldo	-1($XX[0]),$XX[0]	; chill out loop
	sub	$YY,$TX[0],$YY
	and	$mask,$XX[0],$XX[0]
	and	$mask,$YY,$YY
	$ST	$XX[0],`-2*$SZ`($key)
	$ST	$YY,`-1*$SZ`($key)
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
L\$abort
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
___

# private_RC4_set_key and RC4_options.
$code.=<<___;

	.EXPORT	private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
	.ALIGN	8
private_RC4_set_key
	.PROC
	.CALLINFO	NO_CALLS
	.ENTRY
	$ST	%r0,`0*$SZ`($key)
	$ST	%r0,`1*$SZ`($key)
	ldo	`2*$SZ`($key),$key
	copy	%r0,$XX[0]
L\$1st
	$ST	$XX[0],0($key)
	ldo	1($XX[0]),$XX[0]
	bb,>=	$XX[0],`31-8`,L\$1st	; $XX[0]<256
	ldo	$SZ($key),$key

	ldo	`-256*$SZ`($key),$key	; rewind $key
	addl	$len,$inp,$inp		; $inp to point at the end
	sub	%r0,$len,%r23		; inverse index
	copy	%r0,$XX[0]
	copy	%r0,$XX[1]
	ldi	0xff,$mask

L\$2nd
	$LDX	$XX[0]($key),$TX[0]
	ldbx	%r23($inp),$TX[1]
	addi,nuv	1,%r23,%r23	; increment and conditional
	sub	%r0,$len,%r23		; inverse index
	addl	$TX[0],$XX[1],$XX[1]
	addl	$TX[1],$XX[1],$XX[1]
	and	$mask,$XX[1],$XX[1]
	$MKX	$XX[0],$key,$TY
	$LDX	$XX[1]($key),$TX[1]
	$MKX	$XX[1],$key,$YY
	ldo	1($XX[0]),$XX[0]
	$ST	$TX[0],0($YY)
	bb,>=	$XX[0],`31-8`,L\$2nd	; $XX[0]<256
	$ST	$TX[1],0($TY)

	bv,n	(%r2)
	.EXIT
	nop
	.PROCEND

	.EXPORT	RC4_options,ENTRY
	.ALIGN	8
RC4_options
	.PROC
	.CALLINFO	NO_CALLS
	.ENTRY
	blr	%r0,%r28
	ldi	3,%r1
L\$pic
	andcm	%r28,%r1,%r28
	bv	(%r2)
	.EXIT
	ldo	L\$opts-L\$pic(%r28),%r28
	.PROCEND
	.ALIGN	8
L\$opts
	.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
	.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `...` span with the result of evaluating it as Perl
# (frame offsets, table offsets, the conditional sprintf lines).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# The 32-bit flavour uses comib where the template spells cmpib,*.
$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);

print $code;
# Buffered write errors only surface at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";