#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# RC4 for PA-RISC.

# June 2009.
#
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
# For reference, [4x] unrolled loop is >40% faster than folded one.
# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
# is believed to be not sufficient to justify the effort...
#
# Special thanks to polarhome.com for providing HP-UX account.

     21 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     22 
     23 $flavour = shift;
     24 $output = shift;
     25 open STDOUT,">$output";
     26 
     27 if ($flavour =~ /64/) {
     28 	$LEVEL		="2.0W";
     29 	$SIZE_T		=8;
     30 	$FRAME_MARKER	=80;
     31 	$SAVED_RP	=16;
     32 	$PUSH		="std";
     33 	$PUSHMA		="std,ma";
     34 	$POP		="ldd";
     35 	$POPMB		="ldd,mb";
     36 } else {
     37 	$LEVEL		="1.0";
     38 	$SIZE_T		=4;
     39 	$FRAME_MARKER	=48;
     40 	$SAVED_RP	=20;
     41 	$PUSH		="stw";
     42 	$PUSHMA		="stwm";
     43 	$POP		="ldw";
     44 	$POPMB		="ldwm";
     45 }
     46 
     47 $FRAME=4*$SIZE_T+$FRAME_MARKER;	# 4 saved regs + frame marker
     48 				#                [+ argument transfer]
     49 $SZ=1;				# defaults to RC4_CHAR
     50 if (open CONF,"<${dir}../../opensslconf.h") {
     51     while(<CONF>) {
     52 	if (m/#\s*define\s+RC4_INT\s+(.*)/) {
     53 	    $SZ = ($1=~/char$/) ? 1 : 4;
     54 	    last;
     55 	}
     56     }
     57     close CONF;
     58 }
     59 
     60 if ($SZ==1) {	# RC4_CHAR
     61     $LD="ldb";
     62     $LDX="ldbx";
     63     $MKX="addl";
     64     $ST="stb";
     65 } else {	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
     66     $LD="ldw";
     67     $LDX="ldwx,s";
     68     $MKX="sh2addl";
     69     $ST="stw";
     70 }
     71 
     72 $key="%r26";
     73 $len="%r25";
     74 $inp="%r24";
     75 $out="%r23";
     76 
     77 @XX=("%r19","%r20");
     78 @TX=("%r21","%r22");
     79 $YY="%r28";
     80 $TY="%r29";
     81 
     82 $acc="%r1";
     83 $ix="%r2";
     84 $iy="%r3";
     85 $dat0="%r4";
     86 $dat1="%r5";
     87 $rem="%r6";
     88 $mask="%r31";
     89 
     90 sub unrolledloopbody {
     91 for ($i=0;$i<4;$i++) {
     92 $code.=<<___;
     93 	ldo	1($XX[0]),$XX[1]
     94 	`sprintf("$LDX	%$TY(%$key),%$dat1") if ($i>0)`	
     95 	and	$mask,$XX[1],$XX[1]
     96 	$LDX	$YY($key),$TY
     97 	$MKX	$YY,$key,$ix
     98 	$LDX	$XX[1]($key),$TX[1]
     99 	$MKX	$XX[0],$key,$iy
    100 	$ST	$TX[0],0($ix)
    101 	comclr,<> $XX[1],$YY,%r0	; conditional
    102 	copy	$TX[0],$TX[1]		; move
    103 	`sprintf("%sdep	%$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
    104 	$ST	$TY,0($iy)
    105 	addl	$TX[0],$TY,$TY
    106 	addl	$TX[1],$YY,$YY
    107 	and	$mask,$TY,$TY
    108 	and	$mask,$YY,$YY
    109 ___
    110 push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
    111 } }
    112 
    113 sub foldedloop {
    114 my ($label,$count)=@_;
    115 $code.=<<___;
    116 $label
    117 	$MKX	$YY,$key,$iy
    118 	$LDX	$YY($key),$TY
    119 	$MKX	$XX[0],$key,$ix
    120 	$ST	$TX[0],0($iy)
    121 	ldo	1($XX[0]),$XX[0]
    122 	$ST	$TY,0($ix)
    123 	addl	$TX[0],$TY,$TY
    124 	ldbx	$inp($out),$dat1
    125 	and	$mask,$TY,$TY
    126 	and	$mask,$XX[0],$XX[0]
    127 	$LDX	$TY($key),$acc
    128 	$LDX	$XX[0]($key),$TX[0]
    129 	ldo	1($out),$out
    130 	xor	$dat1,$acc,$acc
    131 	addl	$TX[0],$YY,$YY
    132 	stb	$acc,-1($out)
    133 	addib,<> -1,$count,$label	; $count is always small
    134 	and	$mask,$YY,$YY
    135 ___
    136 }
    137 
    138 $code=<<___;
    139 	.LEVEL	$LEVEL
    140 	.SPACE	\$TEXT\$
    141 	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
    142 
    143 	.EXPORT	RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
    144 RC4
    145 	.PROC
    146 	.CALLINFO	FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
    147 	.ENTRY
    148 	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
    149 	$PUSHMA	%r3,$FRAME(%sp)
    150 	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
    151 	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
    152 	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
    153 
    154 	cmpib,*= 0,$len,L\$abort
    155 	sub	$inp,$out,$inp		; distance between $inp and $out
    156 
    157 	$LD	`0*$SZ`($key),$XX[0]
    158 	$LD	`1*$SZ`($key),$YY
    159 	ldo	`2*$SZ`($key),$key
    160 
    161 	ldi	0xff,$mask
    162 	ldi	3,$dat0		
    163 
    164 	ldo	1($XX[0]),$XX[0]	; warm up loop
    165 	and	$mask,$XX[0],$XX[0]
    166 	$LDX	$XX[0]($key),$TX[0]
    167 	addl	$TX[0],$YY,$YY
    168 	cmpib,*>>= 6,$len,L\$oop1	; is $len large enough to bother?
    169 	and	$mask,$YY,$YY
    170 
    171 	and,<>	$out,$dat0,$rem		; is $out aligned?
    172 	b	L\$alignedout
    173 	subi	4,$rem,$rem
    174 	sub	$len,$rem,$len
    175 ___
    176 &foldedloop("L\$alignout",$rem);	# process till $out is aligned
    177 
    178 $code.=<<___;
    179 L\$alignedout				; $len is at least 4 here
    180 	and,<>	$inp,$dat0,$acc		; is $inp aligned?
    181 	b	L\$oop4
    182 	sub	$inp,$acc,$rem		; align $inp
    183 
    184 	sh3addl	$acc,%r0,$acc
    185 	subi	32,$acc,$acc
    186 	mtctl	$acc,%cr11		; load %sar with vshd align factor
    187 	ldwx	$rem($out),$dat0
    188 	ldo	4($rem),$rem
    189 L\$oop4misalignedinp
    190 ___
    191 &unrolledloopbody();
    192 $code.=<<___;
    193 	$LDX	$TY($key),$ix
    194 	ldwx	$rem($out),$dat1
    195 	ldo	-4($len),$len
    196 	or	$ix,$acc,$acc		; last piece, no need to dep
    197 	vshd	$dat0,$dat1,$iy		; align data
    198 	copy	$dat1,$dat0
    199 	xor	$iy,$acc,$acc
    200 	stw	$acc,0($out)
    201 	cmpib,*<< 3,$len,L\$oop4misalignedinp
    202 	ldo	4($out),$out
    203 	cmpib,*= 0,$len,L\$done
    204 	nop
    205 	b	L\$oop1
    206 	nop
    207 
    208 	.ALIGN	8
    209 L\$oop4
    210 ___
    211 &unrolledloopbody();
    212 $code.=<<___;
    213 	$LDX	$TY($key),$ix
    214 	ldwx	$inp($out),$dat0
    215 	ldo	-4($len),$len
    216 	or	$ix,$acc,$acc		; last piece, no need to dep
    217 	xor	$dat0,$acc,$acc
    218 	stw	$acc,0($out)
    219 	cmpib,*<< 3,$len,L\$oop4
    220 	ldo	4($out),$out
    221 	cmpib,*= 0,$len,L\$done
    222 	nop
    223 ___
    224 &foldedloop("L\$oop1",$len);
    225 $code.=<<___;
    226 L\$done
    227 	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2
    228 	ldo	-1($XX[0]),$XX[0]	; chill out loop
    229 	sub	$YY,$TX[0],$YY
    230 	and	$mask,$XX[0],$XX[0]
    231 	and	$mask,$YY,$YY
    232 	$ST	$XX[0],`-2*$SZ`($key)
    233 	$ST	$YY,`-1*$SZ`($key)
    234 	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
    235 	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
    236 	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
    237 L\$abort
    238 	bv	(%r2)
    239 	.EXIT
    240 	$POPMB	-$FRAME(%sp),%r3
    241 	.PROCEND
    242 ___
    243 
    244 $code.=<<___;
    245 
    246 	.EXPORT	private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
    247 	.ALIGN	8
    248 private_RC4_set_key
    249 	.PROC
    250 	.CALLINFO	NO_CALLS
    251 	.ENTRY
    252 	$ST	%r0,`0*$SZ`($key)
    253 	$ST	%r0,`1*$SZ`($key)
    254 	ldo	`2*$SZ`($key),$key
    255 	copy	%r0,@XX[0]
    256 L\$1st
    257 	$ST	@XX[0],0($key)
    258 	ldo	1(@XX[0]),@XX[0]
    259 	bb,>=	@XX[0],`31-8`,L\$1st	; @XX[0]<256
    260 	ldo	$SZ($key),$key
    261 
    262 	ldo	`-256*$SZ`($key),$key	; rewind $key
    263 	addl	$len,$inp,$inp		; $inp to point at the end
    264 	sub	%r0,$len,%r23		; inverse index
    265 	copy	%r0,@XX[0]
    266 	copy	%r0,@XX[1]
    267 	ldi	0xff,$mask
    268 
    269 L\$2nd
    270 	$LDX	@XX[0]($key),@TX[0]
    271 	ldbx	%r23($inp),@TX[1]
    272 	addi,nuv 1,%r23,%r23		; increment and conditional
    273 	sub	%r0,$len,%r23		; inverse index
    274 	addl	@TX[0],@XX[1],@XX[1]
    275 	addl	@TX[1],@XX[1],@XX[1]
    276 	and	$mask,@XX[1],@XX[1]
    277 	$MKX	@XX[0],$key,$TY
    278 	$LDX	@XX[1]($key),@TX[1]
    279 	$MKX	@XX[1],$key,$YY
    280 	ldo	1(@XX[0]),@XX[0]
    281 	$ST	@TX[0],0($YY)
    282 	bb,>=	@XX[0],`31-8`,L\$2nd	; @XX[0]<256
    283 	$ST	@TX[1],0($TY)
    284 
    285 	bv,n	(%r2)
    286 	.EXIT
    287 	nop
    288 	.PROCEND
    289 
    290 	.EXPORT	RC4_options,ENTRY
    291 	.ALIGN	8
    292 RC4_options
    293 	.PROC
    294 	.CALLINFO	NO_CALLS
    295 	.ENTRY
    296 	blr	%r0,%r28
    297 	ldi	3,%r1
    298 L\$pic
    299 	andcm	%r28,%r1,%r28
    300 	bv	(%r2)
    301 	.EXIT
    302 	ldo	L\$opts-L\$pic(%r28),%r28
    303 	.PROCEND
    304 	.ALIGN	8
    305 L\$opts
    306 	.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
    307 	.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
    308 ___
    309 $code =~ s/\`([^\`]*)\`/eval $1/gem;
    310 $code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
    311 
    312 print $code;
    313 close STDOUT;
    314