#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys, at least not on
# in-order-execution cores. While 512-bit RSA sign operations can be
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
# verify:-( All comparisons are against bn_mul_mont-free assembler.
# The module might be of interest to embedded system developers, as
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
# code.

# Usage: <this script> <flavour> [<output-file>]
#
# The first argument selects the ABI flavour. The generated assembly
# is written to the first remaining argument that looks like a plain
# file name (word characters and dashes, a dot, an extension; no
# directory part). If no such argument is given, the assembly goes to
# the inherited STDOUT.

use strict;
use warnings;

######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
my ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
my ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
my $flavour = shift;    # supported flavours are o32,n32,64,nubi32,nubi64
# A missing flavour behaves like an unrecognised one (32-bit, non-NUBI
# code path); normalise it so the pattern matches below stay warning-free.
$flavour = '' unless defined $flavour;

# Pointer arithmetic and register save/restore mnemonics, plus the size
# in bytes of one saved register slot.
my ($PTR_ADD,$PTR_SUB,$REG_S,$REG_L,$SZREG);
if ($flavour =~ /64|n32/i) {
    $PTR_ADD="dadd";    # incidentally works even on n32
    $PTR_SUB="dsub";    # incidentally works even on n32
    $REG_S="sd";
    $REG_L="ld";
    $SZREG=8;
} else {
    $PTR_ADD="add";
    $PTR_SUB="sub";
    $REG_S="sw";
    $REG_L="lw";
    $SZREG=4;
}
# Bit mask for the .mask directive: bits 12..23 when all of $s0-$s11 are
# saved (NUBI), bits 16..23 otherwise.
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
#
# <appro@openssl.org>
#
######################################################################

# Skip arguments until one looks like an output file name.
my $output;
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a file name was actually found, and fail
# loudly if it cannot be created instead of silently emitting the code
# somewhere else.
if (defined $output && length $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Mnemonics operating on BN_ULONG-sized quantities and the size in
# bytes of one BN_ULONG.
my ($LD,$ST,$MULTU,$ADDU,$SUBU,$BNSZ);
if ($flavour =~ /64|n32/i) {
    $LD="ld";
    $ST="sd";
    $MULTU="dmultu";
    $ADDU="daddu";
    $SUBU="dsubu";
    $BNSZ=8;
} else {
    $LD="lw";
    $ST="sw";
    $MULTU="multu";
    $ADDU="addu";
    $SUBU="subu";
    $BNSZ=4;
}

# int bn_mul_mont(
my $rp=$a0;     # BN_ULONG *rp,
my $ap=$a1;     # const BN_ULONG *ap,
my $bp=$a2;     # const BN_ULONG *bp,
my $np=$a3;     # const BN_ULONG *np,
my $n0=$a4;     # const BN_ULONG *n0,
my $num=$a5;    # int num);

# Working registers used by the generated code.
my $lo0=$a6;
my $hi0=$a7;
my $lo1=$t1;
my $hi1=$t2;
my $aj=$s0;
my $bi=$s1;
my $nj=$s2;
# NOTE: from here on $tp no longer names the NUBI thread pointer
# register; it is rebound to $s3 and used as the pointer into the
# temporary vector kept on the stack.
$tp=$s3;
my $alo=$s4;
my $ahi=$s5;
my $nlo=$s6;
my $nhi=$s7;
my $tj=$s8;
my $i=$s9;
my $j=$s10;
my $m1=$s11;

# Number of register-sized slots in the stack frame of
# bn_mul_mont_internal.
my $FRAMESIZE=14;

# Public entry point. On o32 the fifth and sixth arguments are fetched
# from the stack first. The entry point returns 0 without doing any work
# when num < 4 or num >= 17, and otherwise branches to
# bn_mul_mont_internal.
my $code=<<___;
.text

.set	noat
.set	noreorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
___
$code.=<<___ if ($flavour =~ /o32/i);
	lw	$n0,16($sp)
	lw	$num,20($sp)
___
$code.=<<___;
	slt	$at,$num,4
	bnez	$at,1f
	li	$t0,0
	slt	$at,$num,17	# on in-order CPU
	bnezl	$at,bn_mul_mont_internal
	nop
1:	jr	$ra
	li	$a0,0
.end	bn_mul_mont

.align	5
.ent	bn_mul_mont_internal
bn_mul_mont_internal:
	.frame	$fp,$FRAMESIZE*$SZREG,$ra
	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
	$PTR_SUB $sp,$FRAMESIZE*$SZREG
	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
# $s0-$s3 are additionally saved for the NUBI flavours only.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
# Main body: first pass (.L1st), outer/inner loops over the remaining
# words of bp, final subtraction of np (.Lsub), then the copy of the
# selected result into rp while zeroing the temporary vector (.Lcopy).
$code.=<<___;
	move	$fp,$sp

	.set	reorder
	$LD	$n0,0($n0)
	$LD	$bi,0($bp)	# bp[0]
	$LD	$aj,0($ap)	# ap[0]
	$LD	$nj,0($np)	# np[0]

	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
	sll	$num,`log($BNSZ)/log(2)`
	li	$at,-4096
	$PTR_SUB $sp,$num
	and	$sp,$at

	$MULTU	$aj,$bi
	$LD	$alo,$BNSZ($ap)
	$LD	$nlo,$BNSZ($np)
	mflo	$lo0
	mfhi	$hi0
	$MULTU	$lo0,$n0
	mflo	$m1

	$MULTU	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	$MULTU	$nj,$m1
	mflo	$lo1
	mfhi	$hi1
	$MULTU	$nlo,$m1
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	$nlo
	mfhi	$nhi

	move	$tp,$sp
	li	$j,2*$BNSZ
.align	4
.L1st:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	$aj,$bi
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	$alo
	mfhi	$ahi

	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$MULTU	$nj,$m1
	$ADDU	$hi1,$at
	addu	$j,$BNSZ
	$ST	$lo1,($tp)
	sltu	$t0,$j,$num
	mflo	$nlo
	mfhi	$nhi

	bnez	$t0,.L1st
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at

	$ADDU	$lo1,$nlo,$hi1
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi1,$nhi,$t0
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at

	$ST	$lo1,($tp)

	$ADDU	$hi1,$hi0
	sltu	$at,$hi1,$hi0
	$ST	$hi1,$BNSZ($tp)
	$ST	$at,2*$BNSZ($tp)

	li	$i,$BNSZ
.align	4
.Louter:
	$PTR_ADD $bi,$bp,$i
	$LD	$bi,($bi)
	$LD	$aj,($ap)
	$LD	$alo,$BNSZ($ap)
	$LD	$tj,($sp)

	$MULTU	$aj,$bi
	$LD	$nj,($np)
	$LD	$nlo,$BNSZ($np)
	mflo	$lo0
	mfhi	$hi0
	$ADDU	$lo0,$tj
	$MULTU	$lo0,$n0
	sltu	$at,$lo0,$tj
	$ADDU	$hi0,$at
	mflo	$m1

	$MULTU	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	$MULTU	$nj,$m1
	mflo	$lo1
	mfhi	$hi1

	$MULTU	$nlo,$m1
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	$nlo
	mfhi	$nhi

	move	$tp,$sp
	li	$j,2*$BNSZ
	$LD	$tj,$BNSZ($tp)
.align	4
.Linner:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	$aj,$bi
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	$alo
	mfhi	$ahi

	$ADDU	$lo0,$tj
	addu	$j,$BNSZ
	$MULTU	$nj,$m1
	sltu	$at,$lo0,$tj
	$ADDU	$lo1,$lo0
	$ADDU	$hi0,$at
	sltu	$t0,$lo1,$lo0
	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$hi1,$t0
	sltu	$at,$j,$num
	mflo	$nlo
	mfhi	$nhi
	$ST	$lo1,($tp)
	bnez	$at,.Linner
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at
	$ADDU	$lo0,$tj
	sltu	$t0,$lo0,$tj
	$ADDU	$hi0,$t0

	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo1,$hi1
	$ADDU	$hi1,$nhi,$at
	$ADDU	$lo1,$lo0
	sltu	$t0,$lo1,$lo0
	$ADDU	$hi1,$t0
	$ST	$lo1,($tp)

	$ADDU	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	$ADDU	$lo1,$tj
	sltu	$at,$lo1,$tj
	$ADDU	$hi1,$at
	$ST	$lo1,$BNSZ($tp)
	$ST	$hi1,2*$BNSZ($tp)

	addu	$i,$BNSZ
	sltu	$t0,$i,$num
	bnez	$t0,.Louter

	.set	noreorder
	$PTR_ADD $tj,$sp,$num	# &tp[num]
	move	$tp,$sp
	move	$ap,$sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:	$LD	$lo0,($tp)
	$LD	$lo1,($np)
	$PTR_ADD $tp,$BNSZ
	$PTR_ADD $np,$BNSZ
	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	$at,$lo1,$lo0
	$SUBU	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	$ST	$lo0,($rp)
	or	$hi0,$at
	sltu	$at,$tp,$tj
	bnez	$at,.Lsub
	$PTR_ADD $rp,$BNSZ

	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,$sp
	$PTR_SUB $rp,$num	# restore rp
	not	$hi1,$hi0

	and	$ap,$hi0,$sp
	and	$bp,$hi1,$rp
	or	$ap,$ap,$bp	# ap=borrow?tp:rp

.align	4
.Lcopy:	$LD	$aj,($ap)
	$PTR_ADD $ap,$BNSZ
	$ST	$zero,($tp)
	$PTR_ADD $tp,$BNSZ
	sltu	$at,$tp,$tj
	$ST	$aj,($rp)
	bnez	$at,.Lcopy
	$PTR_ADD $rp,$BNSZ

	li	$a0,1
	li	$t0,1

	.set	noreorder
	move	$sp,$fp
	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
# Mirror of the NUBI-only saves in the prologue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end	bn_mul_mont_internal
.rdata
.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `expr` in the generated text with the value of the Perl
# expression (used above to turn the word size into a shift count).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors only surface at close time, so a failure here
# means the emitted assembly may be truncated.
close STDOUT or die "error closing STDOUT: $!";