#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)

# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...

# Locate the directory this script lives in so that the perlasm helper
# can be found both next to the script and in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# The SSE2 code path is generated only when -DOPENSSL_IA32_SSE2 appears
# among the command-line arguments.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

# --------------------------------------------------------------------
# bn_mul_mont(rp, ap, bp, np, n0, num)
#
# Argument order follows the argument-block loads below:
#   rp  - BN_ULONG *        result vector
#   ap  - const BN_ULONG *  first multiplicand
#   bp  - const BN_ULONG *  second multiplicand
#   np  - const BN_ULONG *  modulus
#   n0  - const BN_ULONG *  only n0[0] is read
#   num - int               vector length in words
#
# The generated routine returns 0 in eax without touching anything when
# num < 4, and 1 once the result has been written to rp.
# --------------------------------------------------------------------
&function_begin("bn_mul_mont");

$i="edx";
$j="ecx";
$ap="esi";    $tp="esi";    # overlapping variables!!!
$rp="edi";    $bp="edi";    # overlapping variables!!!
$np="ebp";
$num="ebx";

$_num=&DWP(4*0,"esp");      # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32;                  # size of above frame rounded up to 16n

    &xor    ("eax","eax");
    &mov    ("edi",&wparam(5));     # int num
    &cmp    ("edi",4);
    &jl     (&label("just_leave"));

    &lea    ("esi",&wparam(0));     # put aside pointer to argument block
    # NOTE(review): lea of wparam(1) presumably yields the address of the
    # argument slot rather than the ap pointer itself - confirm intent.
    &lea    ("edx",&wparam(1));     # load ap
    &mov    ("ebp","esp");          # saved stack pointer!
    &add    ("edi",2);              # extra two words on top of tp
    &neg    ("edi");
    &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
    &neg    ("edi");

    # minimize cache contention by arranging 2K window between stack
    # pointer and ap argument [np is also position sensitive vector,
    # but it's assumed to be near ap, as it's allocated at ~same
    # time].
    &mov    ("eax","esp");
    &sub    ("eax","edx");
    &and    ("eax",2047);
    &sub    ("esp","eax");          # this aligns sp and ap modulo 2048

    &xor    ("edx","esp");
    &and    ("edx",2048);
    &xor    ("edx",2048);
    &sub    ("esp","edx");          # this splits them apart modulo 4096

    &and    ("esp",-64);            # align to cache line

    ################################# load argument block...
    &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
    &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
    &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
    &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
    &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
    #&mov   ("edi",&DWP(5*4,"esi"));# int num

    &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
    &mov    ($_rp,"eax");           # ... save a copy of argument block
    &mov    ($_ap,"ebx");
    &mov    ($_bp,"ecx");
    &mov    ($_np,"edx");
    &mov    ($_n0,"esi");
    &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling
    #&mov   ($_num,$num);           # redundant as $num is not reused
    &mov    ($_sp,"ebp");           # saved stack pointer!

# --------------------------------------------------------------------
# SSE2 path: emitted only when requested on the command line, and
# entered at run time only if bit 26 of OPENSSL_ia32cap_P is set.
# --------------------------------------------------------------------
if($sse2) {
$acc0="mm0";    # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";

    &picmeup("eax","OPENSSL_ia32cap_P");
    &bt     (&DWP(0,"eax"),26);
    &jnc    (&label("non_sse2"));

    &mov    ("eax",-1);
    &movd   ($mask,"eax");          # mask 32 lower bits

    &mov    ($ap,$_ap);             # load input pointers
    &mov    ($bp,$_bp);
    &mov    ($np,$_np);

    &xor    ($i,$i);                # i=0
    &xor    ($j,$j);                # j=0

    &movd   ($mul0,&DWP(0,$bp));    # bp[0]
    &movd   ($mul1,&DWP(0,$ap));    # ap[0]
    &movd   ($car1,&DWP(0,$np));    # np[0]

    &pmuludq($mul1,$mul0);          # ap[0]*bp[0]
    &movq   ($car0,$mul1);
    &movq   ($acc0,$mul1);          # I wish movd worked for
    &pand   ($acc0,$mask);          # inter-register transfers

    &pmuludq($mul1,$_n0q);          # *=n0

    &pmuludq($car1,$mul1);          # "t[0]"*np[0]*n0
    &paddq  ($car1,$acc0);

    &movd   ($acc1,&DWP(4,$np));    # np[1]
    &movd   ($acc0,&DWP(4,$ap));    # ap[1]

    &psrlq  ($car0,32);
    &psrlq  ($car1,32);

    &inc    ($j);                   # j++
&set_label("1st",16);
    &pmuludq($acc0,$mul0);          # ap[j]*bp[0]
    &pmuludq($acc1,$mul1);          # np[j]*m1
    &paddq  ($car0,$acc0);          # +=c0
    &paddq  ($car1,$acc1);          # +=c1

    &movq   ($acc0,$car0);
    &pand   ($acc0,$mask);
    &movd   ($acc1,&DWP(4,$np,$j,4));   # np[j+1]
    &paddq  ($car1,$acc0);              # +=ap[j]*bp[0];
    &movd   ($acc0,&DWP(4,$ap,$j,4));   # ap[j+1]
    &psrlq  ($car0,32);
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[j-1]=
    &psrlq  ($car1,32);

    &lea    ($j,&DWP(1,$j));
    &cmp    ($j,$num);
    &jl     (&label("1st"));

    &pmuludq($acc0,$mul0);          # ap[num-1]*bp[0]
    &pmuludq($acc1,$mul1);          # np[num-1]*m1
    &paddq  ($car0,$acc0);          # +=c0
    &paddq  ($car1,$acc1);          # +=c1

    &movq   ($acc0,$car0);
    &pand   ($acc0,$mask);
    &paddq  ($car1,$acc0);          # +=ap[num-1]*bp[0];
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[num-2]=

    &psrlq  ($car0,32);
    &psrlq  ($car1,32);

    &paddq  ($car1,$car0);
    &movq   (&QWP($frame,"esp",$num,4),$car1);  # tp[num].tp[num-1]

    &inc    ($i);                   # i++
&set_label("outer");
    &xor    ($j,$j);                # j=0

    &movd   ($mul0,&DWP(0,$bp,$i,4));   # bp[i]
    &movd   ($mul1,&DWP(0,$ap));        # ap[0]
    &movd   ($temp,&DWP($frame,"esp")); # tp[0]
    &movd   ($car1,&DWP(0,$np));        # np[0]
    &pmuludq($mul1,$mul0);              # ap[0]*bp[i]

    &paddq  ($mul1,$temp);          # +=tp[0]
    &movq   ($acc0,$mul1);
    &movq   ($car0,$mul1);
    &pand   ($acc0,$mask);

    &pmuludq($mul1,$_n0q);          # *=n0

    &pmuludq($car1,$mul1);
    &paddq  ($car1,$acc0);

    &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
    &movd   ($acc1,&DWP(4,$np));            # np[1]
    &movd   ($acc0,&DWP(4,$ap));            # ap[1]

    &psrlq  ($car0,32);
    &psrlq  ($car1,32);
    &paddq  ($car0,$temp);          # +=tp[1]

    &inc    ($j);                   # j++
    &dec    ($num);
&set_label("inner");
    &pmuludq($acc0,$mul0);          # ap[j]*bp[i]
    &pmuludq($acc1,$mul1);          # np[j]*m1
    &paddq  ($car0,$acc0);          # +=c0
    &paddq  ($car1,$acc1);          # +=c1

    &movq   ($acc0,$car0);
    &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
    &pand   ($acc0,$mask);
    &movd   ($acc1,&DWP(4,$np,$j,4));   # np[j+1]
    &paddq  ($car1,$acc0);              # +=ap[j]*bp[i]+tp[j]
    &movd   ($acc0,&DWP(4,$ap,$j,4));   # ap[j+1]
    &psrlq  ($car0,32);
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
    &psrlq  ($car1,32);
    &paddq  ($car0,$temp);              # +=tp[j+1]

    &dec    ($num);
    &lea    ($j,&DWP(1,$j));        # j++
    &jnz    (&label("inner"));

    &mov    ($num,$j);
    &pmuludq($acc0,$mul0);          # ap[num-1]*bp[i]
    &pmuludq($acc1,$mul1);          # np[num-1]*m1
    &paddq  ($car0,$acc0);          # +=c0
    &paddq  ($car1,$acc1);          # +=c1

    &movq   ($acc0,$car0);
    &pand   ($acc0,$mask);
    &paddq  ($car1,$acc0);          # +=ap[num-1]*bp[i]+tp[num-1]
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[num-2]=
    &psrlq  ($car0,32);
    &psrlq  ($car1,32);

    &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
    &paddq  ($car1,$car0);
    &paddq  ($car1,$temp);
    &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]

    &lea    ($i,&DWP(1,$i));        # i++
    &cmp    ($i,$num);
    &jle    (&label("outer"));

    &emms   ();                     # done with mmx bank
    &jmp    (&label("common_tail"));

&set_label("non_sse2",16);
}

# --------------------------------------------------------------------
# Integer-only path. The "if (0)" arm is a disabled bail-out that would
# make the routine return 0; the "else" arm is what is actually emitted.
# --------------------------------------------------------------------
if (0) {
    &mov    ("esp",$_sp);
    &xor    ("eax","eax");  # signal "not fast enough [yet]"
    &jmp    (&label("just_leave"));
    # While the below code provides competitive performance for
    # all key lengths on modern Intel cores, it's still more
    # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
    # means compared to the original integer-only assembler.
    # 512-bit RSA sign is better by ~40%, but that's about all
    # one can say about all CPUs...
} else {
$inp="esi";     # integer path uses these registers differently
$word="edi";
$carry="ebp";

    # Dispatch: the squaring code is taken only when ap==bp and num is
    # even (both conditions are folded into $carry before the jz).
    &mov    ($inp,$_ap);
    &lea    ($carry,&DWP(1,$num));
    &mov    ($word,$_bp);
    &xor    ($j,$j);                            # j=0
    &mov    ("edx",$inp);
    &and    ($carry,1);                         # see if num is even
    &sub    ("edx",$word);                      # see if ap==bp
    &lea    ("eax",&DWP(4,$word,$num,4));       # &bp[num]
    &or     ($carry,"edx");
    &mov    ($word,&DWP(0,$word));              # bp[0]
    &jz     (&label("bn_sqr_mont"));
    &mov    ($_bpend,"eax");
    &mov    ("eax",&DWP(0,$inp));
    &xor    ("edx","edx");

&set_label("mull",16);
    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[j]*bp[0]
    &add    ($carry,"eax");
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &mov    ("eax",&DWP(0,$inp,$j,4));          # ap[j+1]
    &cmp    ($j,$num);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
    &jl     (&label("mull"));

    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[num-1]*bp[0]
    &mov    ($word,$_n0);
    &add    ("eax",$carry);
    &mov    ($inp,$_np);
    &adc    ("edx",0);
    &imul   ($word,&DWP($frame,"esp"));         # n0*tp[0]

    &mov    (&DWP($frame,"esp",$num,4),"eax");  # tp[num-1]=
    &xor    ($j,$j);
    &mov    (&DWP($frame+4,"esp",$num,4),"edx");# tp[num]=
    &mov    (&DWP($frame+8,"esp",$num,4),$j);   # tp[num+1]=

    &mov    ("eax",&DWP(0,$inp));               # np[0]
    &mul    ($word);                            # np[0]*m
    &add    ("eax",&DWP($frame,"esp"));         # +=tp[0]
    &mov    ("eax",&DWP(4,$inp));               # np[1]
    &adc    ("edx",0);
    &inc    ($j);

    &jmp    (&label("2ndmadd"));

&set_label("1stmadd",16);
    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[j]*bp[i]
    &add    ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(0,$inp,$j,4));          # ap[j+1]
    &adc    ("edx",0);
    &cmp    ($j,$num);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
    &jl     (&label("1stmadd"));

    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[num-1]*bp[i]
    &add    ("eax",&DWP($frame,"esp",$num,4));  # +=tp[num-1]
    &mov    ($word,$_n0);
    &adc    ("edx",0);
    &mov    ($inp,$_np);
    &add    ($carry,"eax");
    &adc    ("edx",0);
    &imul   ($word,&DWP($frame,"esp"));         # n0*tp[0]

    &xor    ($j,$j);
    &add    ("edx",&DWP($frame+4,"esp",$num,4));# carry+=tp[num]
    &mov    (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
    &adc    ($j,0);
    &mov    ("eax",&DWP(0,$inp));               # np[0]
    &mov    (&DWP($frame+4,"esp",$num,4),"edx");# tp[num]=
    &mov    (&DWP($frame+8,"esp",$num,4),$j);   # tp[num+1]=

    &mul    ($word);                            # np[0]*m
    &add    ("eax",&DWP($frame,"esp"));         # +=tp[0]
    &mov    ("eax",&DWP(4,$inp));               # np[1]
    &adc    ("edx",0);
    &mov    ($j,1);

&set_label("2ndmadd",16);
    &mov    ($carry,"edx");
    &mul    ($word);                            # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(0,$inp,$j,4));          # np[j+1]
    &adc    ("edx",0);
    &cmp    ($j,$num);
    &mov    (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
    &jl     (&label("2ndmadd"));

    &mov    ($carry,"edx");
    &mul    ($word);                            # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &adc    ("edx",0);
    &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=

    &xor    ("eax","eax");
    &mov    ($j,$_bp);                          # &bp[i]
    &add    ("edx",&DWP($frame+4,"esp",$num,4));# carry+=tp[num]
    &adc    ("eax",&DWP($frame+8,"esp",$num,4));# +=tp[num+1]
    &lea    ($j,&DWP(4,$j));
    &mov    (&DWP($frame,"esp",$num,4),"edx");  # tp[num-1]=
    &cmp    ($j,$_bpend);
    &mov    (&DWP($frame+4,"esp",$num,4),"eax");# tp[num]=
    &je     (&label("common_tail"));

    &mov    ($word,&DWP(0,$j));                 # bp[i+1]
    &mov    ($inp,$_ap);
    &mov    ($_bp,$j);                          # &bp[++i]
    &xor    ($j,$j);
    &xor    ("edx","edx");
    &mov    ("eax",&DWP(0,$inp));
    &jmp    (&label("1stmadd"));

# Dedicated squaring code, reached from the dispatch above.
&set_label("bn_sqr_mont",16);
$sbit=$num;
    &mov    ($_num,$num);
    &mov    ($_bp,$j);                          # i=0

    &mov    ("eax",$word);                      # ap[0]
    &mul    ($word);                            # ap[0]*ap[0]
    &mov    (&DWP($frame,"esp"),"eax");         # tp[0]=
    &mov    ($sbit,"edx");
    &shr    ("edx",1);
    &and    ($sbit,1);
    &inc    ($j);
&set_label("sqr",16);
    &mov    ("eax",&DWP(0,$inp,$j,4));          # ap[j]
    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[j]*ap[0]
    &add    ("eax",$carry);
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &lea    ($carry,&DWP(0,$sbit,"eax",2));
    &shr    ("eax",31);
    &cmp    ($j,$_num);
    &mov    ($sbit,"eax");
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
    &jl     (&label("sqr"));

    &mov    ("eax",&DWP(0,$inp,$j,4));          # ap[num-1]
    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[num-1]*ap[0]
    &add    ("eax",$carry);
    &mov    ($word,$_n0);
    &adc    ("edx",0);
    &mov    ($inp,$_np);
    &lea    ($carry,&DWP(0,$sbit,"eax",2));
    &imul   ($word,&DWP($frame,"esp"));         # n0*tp[0]
    &shr    ("eax",31);
    &mov    (&DWP($frame,"esp",$j,4),$carry);   # tp[num-1]=

    &lea    ($carry,&DWP(0,"eax","edx",2));
    &mov    ("eax",&DWP(0,$inp));               # np[0]
    &shr    ("edx",31);
    &mov    (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
    &mov    (&DWP($frame+8,"esp",$j,4),"edx");  # tp[num+1]=

    &mul    ($word);                            # np[0]*m
    &add    ("eax",&DWP($frame,"esp"));         # +=tp[0]
    &mov    ($num,$j);
    &adc    ("edx",0);
    &mov    ("eax",&DWP(4,$inp));               # np[1]
    &mov    ($j,1);

# Reduction loop of the squaring code, processing two words per pass.
&set_label("3rdmadd",16);
    &mov    ($carry,"edx");
    &mul    ($word);                            # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(4,$inp,$j,4));          # np[j+1]
    &adc    ("edx",0);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=

    &mov    ($carry,"edx");
    &mul    ($word);                            # np[j+1]*m
    &add    ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
    &lea    ($j,&DWP(2,$j));
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(0,$inp,$j,4));          # np[j+2]
    &adc    ("edx",0);
    &cmp    ($j,$num);
    &mov    (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
    &jl     (&label("3rdmadd"));

    &mov    ($carry,"edx");
    &mul    ($word);                            # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &adc    ("edx",0);
    &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=

    &mov    ($j,$_bp);                          # i
    &xor    ("eax","eax");
    &mov    ($inp,$_ap);
    &add    ("edx",&DWP($frame+4,"esp",$num,4));# carry+=tp[num]
    &adc    ("eax",&DWP($frame+8,"esp",$num,4));# +=tp[num+1]
    &mov    (&DWP($frame,"esp",$num,4),"edx");  # tp[num-1]=
    &cmp    ($j,$num);
    &mov    (&DWP($frame+4,"esp",$num,4),"eax");# tp[num]=
    &je     (&label("common_tail"));

    &mov    ($word,&DWP(4,$inp,$j,4));          # ap[i]
    &lea    ($j,&DWP(1,$j));
    &mov    ("eax",$word);
    &mov    ($_bp,$j);                          # ++i
    &mul    ($word);                            # ap[i]*ap[i]
    &add    ("eax",&DWP($frame,"esp",$j,4));    # +=tp[i]
    &adc    ("edx",0);
    &mov    (&DWP($frame,"esp",$j,4),"eax");    # tp[i]=
    &xor    ($carry,$carry);
    &cmp    ($j,$num);
    &lea    ($j,&DWP(1,$j));
    &je     (&label("sqrlast"));

    &mov    ($sbit,"edx");                      # zaps $num
    &shr    ("edx",1);
    &and    ($sbit,1);
&set_label("sqradd",16);
    &mov    ("eax",&DWP(0,$inp,$j,4));          # ap[j]
    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[j]*ap[i]
    &add    ("eax",$carry);
    &lea    ($carry,&DWP(0,"eax","eax"));
    &adc    ("edx",0);
    &shr    ("eax",31);
    &add    ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
    &lea    ($j,&DWP(1,$j));
    &adc    ("eax",0);
    &add    ($carry,$sbit);
    &adc    ("eax",0);
    &cmp    ($j,$_num);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
    &mov    ($sbit,"eax");
    &jle    (&label("sqradd"));

    &mov    ($carry,"edx");
    &add    ("edx","edx");
    &shr    ($carry,31);
    &add    ("edx",$sbit);
    &adc    ($carry,0);
&set_label("sqrlast");
    &mov    ($word,$_n0);
    &mov    ($inp,$_np);
    &imul   ($word,&DWP($frame,"esp"));         # n0*tp[0]

    &add    ("edx",&DWP($frame,"esp",$j,4));    # +=tp[num]
    &mov    ("eax",&DWP(0,$inp));               # np[0]
    &adc    ($carry,0);
    &mov    (&DWP($frame,"esp",$j,4),"edx");    # tp[num]=
    &mov    (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=

    &mul    ($word);                            # np[0]*m
    &add    ("eax",&DWP($frame,"esp"));         # +=tp[0]
    &lea    ($num,&DWP(-1,$j));
    &adc    ("edx",0);
    &mov    ($j,1);
    &mov    ("eax",&DWP(4,$inp));               # np[1]

    &jmp    (&label("3rdmadd"));
}

# --------------------------------------------------------------------
# Common tail shared by all paths: subtract the modulus from tp into rp,
# then use the resulting borrow mask to select between tp and rp word by
# word while overwriting the temporary vector.
# --------------------------------------------------------------------
&set_label("common_tail",16);
    &mov    ($np,$_np);                 # load modulus pointer
    &mov    ($rp,$_rp);                 # load result pointer
    &lea    ($tp,&DWP($frame,"esp"));   # [$ap and $bp are zapped]

    &mov    ("eax",&DWP(0,$tp));        # tp[0]
    &mov    ($j,$num);                  # j=num-1
    &xor    ($i,$i);                    # i=0 and clear CF!

&set_label("sub",16);
    &sbb    ("eax",&DWP(0,$np,$i,4));
    &mov    (&DWP(0,$rp,$i,4),"eax");   # rp[i]=tp[i]-np[i]
    &dec    ($j);                       # doesn't affect CF!
    &mov    ("eax",&DWP(4,$tp,$i,4));   # tp[i+1]
    &lea    ($i,&DWP(1,$i));            # i++
    &jge    (&label("sub"));

    &sbb    ("eax",0);                  # handle upmost overflow bit

&set_label("copy",16);                  # copy or in-place refresh
    &mov    ("edx",&DWP(0,$tp,$num,4));
    &mov    ($np,&DWP(0,$rp,$num,4));
    &xor    ("edx",$np);                # conditional select
    &and    ("edx","eax");
    &xor    ("edx",$np);
    # The statement below used to lack its terminating semicolon, which
    # made Perl parse it and the following call as one bitwise-AND
    # expression; keep the two instructions as separate statements.
    &mov    (&DWP(0,$tp,$num,4),$j);    # zap temporary vector
    &mov    (&DWP(0,$rp,$num,4),"edx"); # rp[i]=tp[i]
    &dec    ($num);
    &jge    (&label("copy"));

    &mov    ("esp",$_sp);               # pull saved stack pointer
    &mov    ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");

&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();