#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
# instructed to '-tune host' code with in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.

use strict;
use warnings;

# This script is a code generator: it interpolates the symbolic register
# names below into one large here-doc of Alpha assembler and writes the
# result to STDOUT.
#
# Incoming arguments follow the Alpha calling convention (a0-a5):
# int bn_mul_mont(
my $rp="a0";	# BN_ULONG *rp,
my $ap="a1";	# const BN_ULONG *ap,
my $bp="a2";	# const BN_ULONG *bp,
my $np="a3";	# const BN_ULONG *np,
my $n0="a4";	# const BN_ULONG *n0,
my $num="a5";	# int num);

# Scratch registers (t0-t12 are caller-saved temporaries; s3-s5 are
# callee-saved and are spilled in the prologue below).  lo/hi pairs hold
# 128-bit partial products of ap[]*bp[i] and np[]*m1 respectively.
my $lo0="t0";
my $hi0="t1";
my $lo1="t2";
my $hi1="t3";
my $aj="t4";
my $bi="t5";
my $nj="t6";
my $tp="t7";	# walks the temporary vector tp[] on the stack
my $alo="t8";
my $ahi="t9";
my $nlo="t10";
my $nhi="t11";
my $tj="t12";
my $i="s3";	# outer-loop index
my $j="s4";	# inner-loop index
my $m1="s5";	# Montgomery factor m1 = tp[0]*n0 mod 2^64

my $code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov -4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

	and	sp,$hi0,$ap
	bic	$bp,$hi0,$bp
	bis	$bp,$ap,$ap	# ap=borrow?tp:rp

.align	4
.Lcopy:	ldq	$aj,0($ap)	# copy or in-place refresh
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	lda	$ap,8($ap)
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Emit the generated assembler.  STDOUT is typically redirected into a
# .s file by the build system; check close() explicitly, because buffered
# write errors (e.g. a full disk) are only reported at close time and an
# unchecked close would silently produce a truncated assembler file.
print $code;
close STDOUT or die "error closing STDOUT: $!";