Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # On 21264 RSA sign performance improves by 70/35/20/15 percent for
     11 # 512/1024/2048/4096 bit key lengths. This is against vendor compiler
     12 # instructed to '-tune host' code with in-line assembler. Other
     13 # benchmarks improve by 15-20%. To anchor it to something else, the
     14 # code provides approximately the same performance per GHz as AMD64.
     15 # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
     16 # difference.
     17 
     18 # int bn_mul_mont(
     19 $rp="a0";	# BN_ULONG *rp,
     20 $ap="a1";	# const BN_ULONG *ap,
     21 $bp="a2";	# const BN_ULONG *bp,
     22 $np="a3";	# const BN_ULONG *np,
     23 $n0="a4";	# const BN_ULONG *n0,
     24 $num="a5";	# int num);
     25 
     26 $lo0="t0";
     27 $hi0="t1";
     28 $lo1="t2";
     29 $hi1="t3";
     30 $aj="t4";
     31 $bi="t5";
     32 $nj="t6";
     33 $tp="t7";
     34 $alo="t8";
     35 $ahi="t9";
     36 $nlo="t10";
     37 $nhi="t11";
     38 $tj="t12";
     39 $i="s3";
     40 $j="s4";
     41 $m1="s5";
     42 
     43 $code=<<___;
     44 #ifdef __linux__
     45 #include <asm/regdef.h>
     46 #else
     47 #include <asm.h>
     48 #include <regdef.h>
     49 #endif
     50 
     51 .text
     52 
     53 .set	noat
     54 .set	noreorder
     55 
     56 .globl	bn_mul_mont
     57 .align	5
     58 .ent	bn_mul_mont
     59 bn_mul_mont:
     60 	lda	sp,-48(sp)
     61 	stq	ra,0(sp)
     62 	stq	s3,8(sp)
     63 	stq	s4,16(sp)
     64 	stq	s5,24(sp)
     65 	stq	fp,32(sp)
     66 	mov	sp,fp
     67 	.mask	0x0400f000,-48
     68 	.frame	fp,48,ra
     69 	.prologue 0
     70 
     71 	.align	4
     72 	.set	reorder
     73 	sextl	$num,$num
     74 	mov	0,v0
     75 	cmplt	$num,4,AT
     76 	bne	AT,.Lexit
     77 
     78 	ldq	$hi0,0($ap)	# ap[0]
     79 	s8addq	$num,16,AT
     80 	ldq	$aj,8($ap)
     81 	subq	sp,AT,sp
     82 	ldq	$bi,0($bp)	# bp[0]
     83 	lda	AT,-4096(zero)	# mov	-4096,AT
     84 	ldq	$n0,0($n0)
     85 	and	sp,AT,sp
     86 
     87 	mulq	$hi0,$bi,$lo0
     88 	ldq	$hi1,0($np)	# np[0]
     89 	umulh	$hi0,$bi,$hi0
     90 	ldq	$nj,8($np)
     91 
     92 	mulq	$lo0,$n0,$m1
     93 
     94 	mulq	$hi1,$m1,$lo1
     95 	umulh	$hi1,$m1,$hi1
     96 
     97 	addq	$lo1,$lo0,$lo1
     98 	cmpult	$lo1,$lo0,AT
     99 	addq	$hi1,AT,$hi1
    100 
    101 	mulq	$aj,$bi,$alo
    102 	mov	2,$j
    103 	umulh	$aj,$bi,$ahi
    104 	mov	sp,$tp
    105 
    106 	mulq	$nj,$m1,$nlo
    107 	s8addq	$j,$ap,$aj
    108 	umulh	$nj,$m1,$nhi
    109 	s8addq	$j,$np,$nj
    110 .align	4
    111 .L1st:
    112 	.set	noreorder
    113 	ldq	$aj,0($aj)
    114 	addl	$j,1,$j
    115 	ldq	$nj,0($nj)
    116 	lda	$tp,8($tp)
    117 
    118 	addq	$alo,$hi0,$lo0
    119 	mulq	$aj,$bi,$alo
    120 	cmpult	$lo0,$hi0,AT
    121 	addq	$nlo,$hi1,$lo1
    122 
    123 	mulq	$nj,$m1,$nlo
    124 	addq	$ahi,AT,$hi0
    125 	cmpult	$lo1,$hi1,v0
    126 	cmplt	$j,$num,$tj
    127 
    128 	umulh	$aj,$bi,$ahi
    129 	addq	$nhi,v0,$hi1
    130 	addq	$lo1,$lo0,$lo1
    131 	s8addq	$j,$ap,$aj
    132 
    133 	umulh	$nj,$m1,$nhi
    134 	cmpult	$lo1,$lo0,v0
    135 	addq	$hi1,v0,$hi1
    136 	s8addq	$j,$np,$nj
    137 
    138 	stq	$lo1,-8($tp)
    139 	nop
    140 	unop
    141 	bne	$tj,.L1st
    142 	.set	reorder
    143 
    144 	addq	$alo,$hi0,$lo0
    145 	addq	$nlo,$hi1,$lo1
    146 	cmpult	$lo0,$hi0,AT
    147 	cmpult	$lo1,$hi1,v0
    148 	addq	$ahi,AT,$hi0
    149 	addq	$nhi,v0,$hi1
    150 
    151 	addq	$lo1,$lo0,$lo1
    152 	cmpult	$lo1,$lo0,v0
    153 	addq	$hi1,v0,$hi1
    154 
    155 	stq	$lo1,0($tp)
    156 
    157 	addq	$hi1,$hi0,$hi1
    158 	cmpult	$hi1,$hi0,AT
    159 	stq	$hi1,8($tp)
    160 	stq	AT,16($tp)
    161 
    162 	mov	1,$i
    163 .align	4
    164 .Louter:
    165 	s8addq	$i,$bp,$bi
    166 	ldq	$hi0,0($ap)
    167 	ldq	$aj,8($ap)
    168 	ldq	$bi,0($bi)
    169 	ldq	$hi1,0($np)
    170 	ldq	$nj,8($np)
    171 	ldq	$tj,0(sp)
    172 
    173 	mulq	$hi0,$bi,$lo0
    174 	umulh	$hi0,$bi,$hi0
    175 
    176 	addq	$lo0,$tj,$lo0
    177 	cmpult	$lo0,$tj,AT
    178 	addq	$hi0,AT,$hi0
    179 
    180 	mulq	$lo0,$n0,$m1
    181 
    182 	mulq	$hi1,$m1,$lo1
    183 	umulh	$hi1,$m1,$hi1
    184 
    185 	addq	$lo1,$lo0,$lo1
    186 	cmpult	$lo1,$lo0,AT
    187 	mov	2,$j
    188 	addq	$hi1,AT,$hi1
    189 
    190 	mulq	$aj,$bi,$alo
    191 	mov	sp,$tp
    192 	umulh	$aj,$bi,$ahi
    193 
    194 	mulq	$nj,$m1,$nlo
    195 	s8addq	$j,$ap,$aj
    196 	umulh	$nj,$m1,$nhi
    197 .align	4
    198 .Linner:
    199 	.set	noreorder
    200 	ldq	$tj,8($tp)	#L0
    201 	nop			#U1
    202 	ldq	$aj,0($aj)	#L1
    203 	s8addq	$j,$np,$nj	#U0
    204 
    205 	ldq	$nj,0($nj)	#L0
    206 	nop			#U1
    207 	addq	$alo,$hi0,$lo0	#L1
    208 	lda	$tp,8($tp)
    209 
    210 	mulq	$aj,$bi,$alo	#U1
    211 	cmpult	$lo0,$hi0,AT	#L0
    212 	addq	$nlo,$hi1,$lo1	#L1
    213 	addl	$j,1,$j
    214 
    215 	mulq	$nj,$m1,$nlo	#U1
    216 	addq	$ahi,AT,$hi0	#L0
    217 	addq	$lo0,$tj,$lo0	#L1
    218 	cmpult	$lo1,$hi1,v0	#U0
    219 
    220 	umulh	$aj,$bi,$ahi	#U1
    221 	cmpult	$lo0,$tj,AT	#L0
    222 	addq	$lo1,$lo0,$lo1	#L1
    223 	addq	$nhi,v0,$hi1	#U0
    224 
    225 	umulh	$nj,$m1,$nhi	#U1
    226 	s8addq	$j,$ap,$aj	#L0
    227 	cmpult	$lo1,$lo0,v0	#L1
    228 	cmplt	$j,$num,$tj	#U0	# borrow $tj
    229 
    230 	addq	$hi0,AT,$hi0	#L0
    231 	addq	$hi1,v0,$hi1	#U1
    232 	stq	$lo1,-8($tp)	#L1
    233 	bne	$tj,.Linner	#U0
    234 	.set	reorder
    235 
    236 	ldq	$tj,8($tp)
    237 	addq	$alo,$hi0,$lo0
    238 	addq	$nlo,$hi1,$lo1
    239 	cmpult	$lo0,$hi0,AT
    240 	cmpult	$lo1,$hi1,v0
    241 	addq	$ahi,AT,$hi0
    242 	addq	$nhi,v0,$hi1
    243 
    244 	addq	$lo0,$tj,$lo0
    245 	cmpult	$lo0,$tj,AT
    246 	addq	$hi0,AT,$hi0
    247 
    248 	ldq	$tj,16($tp)
    249 	addq	$lo1,$lo0,$j
    250 	cmpult	$j,$lo0,v0
    251 	addq	$hi1,v0,$hi1
    252 
    253 	addq	$hi1,$hi0,$lo1
    254 	stq	$j,0($tp)
    255 	cmpult	$lo1,$hi0,$hi1
    256 	addq	$lo1,$tj,$lo1
    257 	cmpult	$lo1,$tj,AT
    258 	addl	$i,1,$i
    259 	addq	$hi1,AT,$hi1
    260 	stq	$lo1,8($tp)
    261 	cmplt	$i,$num,$tj	# borrow $tj
    262 	stq	$hi1,16($tp)
    263 	bne	$tj,.Louter
    264 
    266 	s8addq	$num,sp,$tj	# &tp[num]
    267 	mov	$rp,$bp		# put rp aside
    268 	mov	sp,$tp
    269 	mov	sp,$ap
    270 	mov	0,$hi0		# clear borrow bit
    271 
    272 .align	4
    273 .Lsub:	ldq	$lo0,0($tp)
    274 	ldq	$lo1,0($np)
    275 	lda	$tp,8($tp)
    276 	lda	$np,8($np)
    277 	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
    278 	cmpult	$lo0,$lo1,AT
    279 	subq	$lo1,$hi0,$lo0
    280 	cmpult	$lo1,$lo0,$hi0
    281 	or	$hi0,AT,$hi0
    282 	stq	$lo0,0($rp)
    283 	cmpult	$tp,$tj,v0
    284 	lda	$rp,8($rp)
    285 	bne	v0,.Lsub
    286 
    287 	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
    288 	mov	sp,$tp
    289 	mov	$bp,$rp		# restore rp
    290 
    291 	and	sp,$hi0,$ap
    292 	bic	$bp,$hi0,$bp
    293 	bis	$bp,$ap,$ap	# ap=borrow?tp:rp
    294 
    295 .align	4
    296 .Lcopy:	ldq	$aj,0($ap)	# copy or in-place refresh
    297 	lda	$tp,8($tp)
    298 	lda	$rp,8($rp)
    299 	lda	$ap,8($ap)
    300 	stq	zero,-8($tp)	# zap tp
    301 	cmpult	$tp,$tj,AT
    302 	stq	$aj,-8($rp)
    303 	bne	AT,.Lcopy
    304 	mov	1,v0
    305 
    306 .Lexit:
    307 	.set	noreorder
    308 	mov	fp,sp
    309 	/*ldq	ra,0(sp)*/
    310 	ldq	s3,8(sp)
    311 	ldq	s4,16(sp)
    312 	ldq	s5,24(sp)
    313 	ldq	fp,32(sp)
    314 	lda	sp,48(sp)
    315 	ret	(ra)
    316 .end	bn_mul_mont
    317 .ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
    318 .align	2
    319 ___
    320 
    321 print $code;
    322 close STDOUT;
    323