Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # This module doesn't present direct interest for OpenSSL, because it
     11 # doesn't provide better performance for longer keys, at least not on
     12 # in-order-execution cores. While 512-bit RSA sign operations can be
     13 # 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
     14 # 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
     15 # 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
     16 # verify:-( All comparisons are against bn_mul_mont-free assembler.
     17 # The module might be of interest to embedded system developers, as
     18 # the code is smaller than 1KB, yet offers >3x improvement on MIPS64
     19 # and 75-30% [less for longer keys] on MIPS32 over compiler-generated
     20 # code.
     21 
     22 ######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
     24 # widely used. Then there is a new contender: NUBI. It appears that if
     25 # one picks the latter, it's possible to arrange code in ABI neutral
     26 # manner. Therefore let's stick to NUBI register layout:
     27 #
     28 ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
     29 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
     30 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
     31 ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
     32 #
     33 # The return value is placed in $a0. Following coding rules facilitate
     34 # interoperability:
     35 #
     36 # - never ever touch $tp, "thread pointer", former $gp;
     37 # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
     38 #   old code];
     39 # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
     40 #
     41 # For reference here is register layout for N32/64 MIPS ABIs:
     42 #
     43 # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
     44 # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
     45 # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
     46 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
     47 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
     48 #
     49 $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
     50 
     51 if ($flavour =~ /64|n32/i) {
     52 	$PTR_ADD="dadd";	# incidentally works even on n32
     53 	$PTR_SUB="dsub";	# incidentally works even on n32
     54 	$REG_S="sd";
     55 	$REG_L="ld";
     56 	$SZREG=8;
     57 } else {
     58 	$PTR_ADD="add";
     59 	$PTR_SUB="sub";
     60 	$REG_S="sw";
     61 	$REG_L="lw";
     62 	$SZREG=4;
     63 }
     64 $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
     65 #
# <appro@openssl.org>
     67 #
     68 ######################################################################
     69 
     70 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
     71 open STDOUT,">$output";
     72 
     73 if ($flavour =~ /64|n32/i) {
     74 	$LD="ld";
     75 	$ST="sd";
     76 	$MULTU="dmultu";
     77 	$ADDU="daddu";
     78 	$SUBU="dsubu";
     79 	$BNSZ=8;
     80 } else {
     81 	$LD="lw";
     82 	$ST="sw";
     83 	$MULTU="multu";
     84 	$ADDU="addu";
     85 	$SUBU="subu";
     86 	$BNSZ=4;
     87 }
     88 
     89 # int bn_mul_mont(
     90 $rp=$a0;	# BN_ULONG *rp,
     91 $ap=$a1;	# const BN_ULONG *ap,
     92 $bp=$a2;	# const BN_ULONG *bp,
     93 $np=$a3;	# const BN_ULONG *np,
     94 $n0=$a4;	# const BN_ULONG *n0,
     95 $num=$a5;	# int num);
     96 
     97 $lo0=$a6;
     98 $hi0=$a7;
     99 $lo1=$t1;
    100 $hi1=$t2;
    101 $aj=$s0;
    102 $bi=$s1;
    103 $nj=$s2;
    104 $tp=$s3;
    105 $alo=$s4;
    106 $ahi=$s5;
    107 $nlo=$s6;
    108 $nhi=$s7;
    109 $tj=$s8;
    110 $i=$s9;
    111 $j=$s10;
    112 $m1=$s11;
    113 
    114 $FRAMESIZE=14;
    115 
    116 $code=<<___;
    117 .text
    118 
    119 .set	noat
    120 .set	noreorder
    121 
    122 .align	5
    123 .globl	bn_mul_mont
    124 .ent	bn_mul_mont
    125 bn_mul_mont:
    126 ___
    127 $code.=<<___ if ($flavour =~ /o32/i);
    128 	lw	$n0,16($sp)
    129 	lw	$num,20($sp)
    130 ___
    131 $code.=<<___;
    132 	slt	$at,$num,4
    133 	bnez	$at,1f
    134 	li	$t0,0
    135 	slt	$at,$num,17	# on in-order CPU
    136 	bnezl	$at,bn_mul_mont_internal
    137 	nop
    138 1:	jr	$ra
    139 	li	$a0,0
    140 .end	bn_mul_mont
    141 
    142 .align	5
    143 .ent	bn_mul_mont_internal
    144 bn_mul_mont_internal:
    145 	.frame	$fp,$FRAMESIZE*$SZREG,$ra
    146 	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
    147 	$PTR_SUB $sp,$FRAMESIZE*$SZREG
    148 	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
    149 	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
    150 	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
    151 	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
    152 	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
    153 	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
    154 	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
    155 	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
    156 	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
    157 ___
    158 $code.=<<___ if ($flavour =~ /nubi/i);
    159 	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
    160 	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
    161 	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
    162 	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
    163 ___
    164 $code.=<<___;
    165 	move	$fp,$sp
    166 
    167 	.set	reorder
    168 	$LD	$n0,0($n0)
    169 	$LD	$bi,0($bp)	# bp[0]
    170 	$LD	$aj,0($ap)	# ap[0]
    171 	$LD	$nj,0($np)	# np[0]
    172 
    173 	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
    174 	sll	$num,`log($BNSZ)/log(2)`
    175 	li	$at,-4096
    176 	$PTR_SUB $sp,$num
    177 	and	$sp,$at
    178 
    179 	$MULTU	$aj,$bi
    180 	$LD	$alo,$BNSZ($ap)
    181 	$LD	$nlo,$BNSZ($np)
    182 	mflo	$lo0
    183 	mfhi	$hi0
    184 	$MULTU	$lo0,$n0
    185 	mflo	$m1
    186 
    187 	$MULTU	$alo,$bi
    188 	mflo	$alo
    189 	mfhi	$ahi
    190 
    191 	$MULTU	$nj,$m1
    192 	mflo	$lo1
    193 	mfhi	$hi1
    194 	$MULTU	$nlo,$m1
    195 	$ADDU	$lo1,$lo0
    196 	sltu	$at,$lo1,$lo0
    197 	$ADDU	$hi1,$at
    198 	mflo	$nlo
    199 	mfhi	$nhi
    200 
    201 	move	$tp,$sp
    202 	li	$j,2*$BNSZ
    203 .align	4
    204 .L1st:
    205 	.set	noreorder
    206 	$PTR_ADD $aj,$ap,$j
    207 	$PTR_ADD $nj,$np,$j
    208 	$LD	$aj,($aj)
    209 	$LD	$nj,($nj)
    210 
    211 	$MULTU	$aj,$bi
    212 	$ADDU	$lo0,$alo,$hi0
    213 	$ADDU	$lo1,$nlo,$hi1
    214 	sltu	$at,$lo0,$hi0
    215 	sltu	$t0,$lo1,$hi1
    216 	$ADDU	$hi0,$ahi,$at
    217 	$ADDU	$hi1,$nhi,$t0
    218 	mflo	$alo
    219 	mfhi	$ahi
    220 
    221 	$ADDU	$lo1,$lo0
    222 	sltu	$at,$lo1,$lo0
    223 	$MULTU	$nj,$m1
    224 	$ADDU	$hi1,$at
    225 	addu	$j,$BNSZ
    226 	$ST	$lo1,($tp)
    227 	sltu	$t0,$j,$num
    228 	mflo	$nlo
    229 	mfhi	$nhi
    230 
    231 	bnez	$t0,.L1st
    232 	$PTR_ADD $tp,$BNSZ
    233 	.set	reorder
    234 
    235 	$ADDU	$lo0,$alo,$hi0
    236 	sltu	$at,$lo0,$hi0
    237 	$ADDU	$hi0,$ahi,$at
    238 
    239 	$ADDU	$lo1,$nlo,$hi1
    240 	sltu	$t0,$lo1,$hi1
    241 	$ADDU	$hi1,$nhi,$t0
    242 	$ADDU	$lo1,$lo0
    243 	sltu	$at,$lo1,$lo0
    244 	$ADDU	$hi1,$at
    245 
    246 	$ST	$lo1,($tp)
    247 
    248 	$ADDU	$hi1,$hi0
    249 	sltu	$at,$hi1,$hi0
    250 	$ST	$hi1,$BNSZ($tp)
    251 	$ST	$at,2*$BNSZ($tp)
    252 
    253 	li	$i,$BNSZ
    254 .align	4
    255 .Louter:
    256 	$PTR_ADD $bi,$bp,$i
    257 	$LD	$bi,($bi)
    258 	$LD	$aj,($ap)
    259 	$LD	$alo,$BNSZ($ap)
    260 	$LD	$tj,($sp)
    261 
    262 	$MULTU	$aj,$bi
    263 	$LD	$nj,($np)
    264 	$LD	$nlo,$BNSZ($np)
    265 	mflo	$lo0
    266 	mfhi	$hi0
    267 	$ADDU	$lo0,$tj
    268 	$MULTU	$lo0,$n0
    269 	sltu	$at,$lo0,$tj
    270 	$ADDU	$hi0,$at
    271 	mflo	$m1
    272 
    273 	$MULTU	$alo,$bi
    274 	mflo	$alo
    275 	mfhi	$ahi
    276 
    277 	$MULTU	$nj,$m1
    278 	mflo	$lo1
    279 	mfhi	$hi1
    280 
    281 	$MULTU	$nlo,$m1
    282 	$ADDU	$lo1,$lo0
    283 	sltu	$at,$lo1,$lo0
    284 	$ADDU	$hi1,$at
    285 	mflo	$nlo
    286 	mfhi	$nhi
    287 
    288 	move	$tp,$sp
    289 	li	$j,2*$BNSZ
    290 	$LD	$tj,$BNSZ($tp)
    291 .align	4
    292 .Linner:
    293 	.set	noreorder
    294 	$PTR_ADD $aj,$ap,$j
    295 	$PTR_ADD $nj,$np,$j
    296 	$LD	$aj,($aj)
    297 	$LD	$nj,($nj)
    298 
    299 	$MULTU	$aj,$bi
    300 	$ADDU	$lo0,$alo,$hi0
    301 	$ADDU	$lo1,$nlo,$hi1
    302 	sltu	$at,$lo0,$hi0
    303 	sltu	$t0,$lo1,$hi1
    304 	$ADDU	$hi0,$ahi,$at
    305 	$ADDU	$hi1,$nhi,$t0
    306 	mflo	$alo
    307 	mfhi	$ahi
    308 
    309 	$ADDU	$lo0,$tj
    310 	addu	$j,$BNSZ
    311 	$MULTU	$nj,$m1
    312 	sltu	$at,$lo0,$tj
    313 	$ADDU	$lo1,$lo0
    314 	$ADDU	$hi0,$at
    315 	sltu	$t0,$lo1,$lo0
    316 	$LD	$tj,2*$BNSZ($tp)
    317 	$ADDU	$hi1,$t0
    318 	sltu	$at,$j,$num
    319 	mflo	$nlo
    320 	mfhi	$nhi
    321 	$ST	$lo1,($tp)
    322 	bnez	$at,.Linner
    323 	$PTR_ADD $tp,$BNSZ
    324 	.set	reorder
    325 
    326 	$ADDU	$lo0,$alo,$hi0
    327 	sltu	$at,$lo0,$hi0
    328 	$ADDU	$hi0,$ahi,$at
    329 	$ADDU	$lo0,$tj
    330 	sltu	$t0,$lo0,$tj
    331 	$ADDU	$hi0,$t0
    332 
    333 	$LD	$tj,2*$BNSZ($tp)
    334 	$ADDU	$lo1,$nlo,$hi1
    335 	sltu	$at,$lo1,$hi1
    336 	$ADDU	$hi1,$nhi,$at
    337 	$ADDU	$lo1,$lo0
    338 	sltu	$t0,$lo1,$lo0
    339 	$ADDU	$hi1,$t0
    340 	$ST	$lo1,($tp)
    341 
    342 	$ADDU	$lo1,$hi1,$hi0
    343 	sltu	$hi1,$lo1,$hi0
    344 	$ADDU	$lo1,$tj
    345 	sltu	$at,$lo1,$tj
    346 	$ADDU	$hi1,$at
    347 	$ST	$lo1,$BNSZ($tp)
    348 	$ST	$hi1,2*$BNSZ($tp)
    349 
    350 	addu	$i,$BNSZ
    351 	sltu	$t0,$i,$num
    352 	bnez	$t0,.Louter
    353 
    355 	.set	noreorder
    356 	$PTR_ADD $tj,$sp,$num	# &tp[num]
    357 	move	$tp,$sp
    358 	move	$ap,$sp
    359 	li	$hi0,0		# clear borrow bit
    360 
    361 .align	4
    362 .Lsub:	$LD	$lo0,($tp)
    363 	$LD	$lo1,($np)
    364 	$PTR_ADD $tp,$BNSZ
    365 	$PTR_ADD $np,$BNSZ
    366 	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
    367 	sgtu	$at,$lo1,$lo0
    368 	$SUBU	$lo0,$lo1,$hi0
    369 	sgtu	$hi0,$lo0,$lo1
    370 	$ST	$lo0,($rp)
    371 	or	$hi0,$at
    372 	sltu	$at,$tp,$tj
    373 	bnez	$at,.Lsub
    374 	$PTR_ADD $rp,$BNSZ
    375 
    376 	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
    377 	move	$tp,$sp
    378 	$PTR_SUB $rp,$num	# restore rp
    379 	not	$hi1,$hi0
    380 
    381 	and	$ap,$hi0,$sp
    382 	and	$bp,$hi1,$rp
    383 	or	$ap,$ap,$bp	# ap=borrow?tp:rp
    384 
    385 .align	4
    386 .Lcopy:	$LD	$aj,($ap)
    387 	$PTR_ADD $ap,$BNSZ
    388 	$ST	$zero,($tp)
    389 	$PTR_ADD $tp,$BNSZ
    390 	sltu	$at,$tp,$tj
    391 	$ST	$aj,($rp)
    392 	bnez	$at,.Lcopy
    393 	$PTR_ADD $rp,$BNSZ
    394 
    395 	li	$a0,1
    396 	li	$t0,1
    397 
    398 	.set	noreorder
    399 	move	$sp,$fp
    400 	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
    401 	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
    402 	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
    403 	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
    404 	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
    405 	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
    406 	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
    407 	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
    408 	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
    409 ___
    410 $code.=<<___ if ($flavour =~ /nubi/i);
    411 	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
    412 	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
    413 	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
    414 	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
    415 ___
    416 $code.=<<___;
    417 	jr	$ra
    418 	$PTR_ADD $sp,$FRAMESIZE*$SZREG
    419 .end	bn_mul_mont_internal
    420 .rdata
    421 .asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
    422 ___
    423 
    424 $code =~ s/\`([^\`]*)\`/eval $1/gem;
    425 
    426 print $code;
    427 close STDOUT;
    428