#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives modest
# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
# than twice as fast. Most common rsa1024 sign is improved by a
# respectable 50%. It remains to be seen if loop unrolling and a
# dedicated squaring routine can provide further improvement...

# July 2011.
#
# Add dedicated squaring procedure. Performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable this option after testing. $addx goes up to 1.
$addx = 0;
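
# For reference, a hedged sketch (not used by this file) of the run-time
# feature test the $addx code paths perform: OPENSSL_ia32cap_P+8 holds the
# CPUID.(EAX=7,ECX=0).EBX word, where bit 8 is BMI2 (MULX) and bit 19 is
# ADX (ADCX/ADOX), i.e. the 0x80100 mask tested in the assembly below.
sub _have_bmi2_adx {
	my ($ia32cap_word2) = @_;	# value of OPENSSL_ia32cap_P[2]
	return (($ia32cap_word2 & 0x80100) == 0x80100);
}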

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

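# As a hedged, illustrative reference (never called by the generator): every
# routine in this file computes the Montgomery product
# rp = ap*bp*2^(-64*num) mod np, where n0[0] holds -np^(-1) mod 2^64.
# Math::BigInt keeps the sketch exact; the names are ours, not the library's.
sub _bn_mul_mont_reference {
	use Math::BigInt;
	my ($a, $b, $n, $n0, $num) = @_;	# Math::BigInt values, $num 64-bit words
	my $w = Math::BigInt->new(2)->bpow(64);	# word base, 2^64
	my $t = $a->copy()->bmul($b);
	for (1 .. $num) {			# one word-level reduction per limb
		my $m = (($t % $w) * $n0) % $w;	# m = t[0]*n0 mod 2^64
		$t = ($t + $m * $n) / $w;	# (t + m*n) is divisible by 2^64
	}
	$t->bsub($n) if $t->bcmp($n) >= 0;	# single conditional final subtraction
	return $t;				# == a*b*2^(-64*num) mod n
}
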
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	2($num),%r10
	mov	%rsp,%r11
	neg	%r10
	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	mov	$num,$j			# j=num
.align	16
.Lcopy:					# copy or in-place refresh
	mov	(%rsp,$i,8),$ap
	mov	($rp,$i,8),$np
	xor	$np,$ap			# conditional select:
	and	%rax,$ap		# ((ap ^ np) & %rax) ^ np
	xor	$np,$ap			# ap = borrow?tp:rp
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	$ap,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
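# The .Lsub/.Lcopy pair above subtracts the modulus unconditionally into rp,
# then uses the final borrow as an all-zeros/all-ones mask to pick between
# tp and rp without a data-dependent branch. A hedged scalar model of that
# masked select (our naming, illustration only):
sub _ct_select64 {
	my ($tp_word, $rp_word, $borrow_mask) = @_;	# mask is 0 or 0xffffffffffffffff
	return (($tp_word ^ $rp_word) & $borrow_mask) ^ $rp_word;	# mask ? tp : rp
}
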
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	4($num),%r10
	mov	%rsp,%r11
	neg	%r10
	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jb	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i+=4
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[0],%xmm0
	punpcklqdq %xmm0,%xmm0		# extend mask to 128 bits
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0

	mov	$num,$j
	pxor	%xmm5,%xmm5
	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# copy or in-place refresh
	movdqu	(%rsp,$i),%xmm2
	movdqu	16(%rsp,$i),%xmm4
	movdqu	($rp,$i),%xmm1
	movdqu	16($rp,$i),%xmm3
	pxor	%xmm1,%xmm2		# conditional select
	pxor	%xmm3,%xmm4
	pand	%xmm0,%xmm2
	pand	%xmm0,%xmm4
	pxor	%xmm1,%xmm2
	pxor	%xmm3,%xmm4
	movdqu	%xmm2,($rp,$i)
	movdqu	%xmm4,16($rp,$i)
	movdqa	%xmm5,(%rsp,$i)		# zap temporary vectors
	movdqa	%xmm5,16(%rsp,$i)

	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
{{{
######################################################################
# void bn_sqr8x_mont(
my $rptr="%rdi";	# const BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# not used
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0,
my $num ="%r9";		# int num); has to be divisible by 8

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
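
# Dispatch recap, as a hedged helper (ours, called by nothing below):
# bn_mul_mont branches to .Lsqr8x_enter only when ap==bp and num is a
# positive multiple of 8, cf. the tests at the top of bn_mul_mont.
sub _dispatches_to_sqr8x {
	my ($ap, $bp, $num) = @_;	# $ap/$bp as addresses, $num in words
	return ($num >= 8) && (($num & 7) == 0) && ($ap == $bp);
}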

$code.=<<___	if ($addx);
.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
___
$code.=<<___;
.extern	bn_sqr8x_internal		# see x86_64-mont5 module

.type	bn_sqr8x_mont,\@function,6
.align	32
bn_sqr8x_mont:
.Lsqr8x_enter:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10		# 4*$num
	neg	$num

	##############################################################
	# ensure that stack frame doesn't alias with $aptr modulo
	# 4096. this is done to allow the memory disambiguation logic
	# to do its job.
	#
	lea	-64(%rsp,$num,4),%r11
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,4),%r10	# 4096-frame-4*$num
	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
.Lsqr8x_sp_done:
	and	\$-64,%rsp
	mov	$num,%r10
	neg	$num

	lea	64(%rsp,$num,2),%r11	# copy of modulus
	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.Lsqr8x_body:

	mov	$num,$i
	movq	%r11, %xmm2		# save pointer to modulus copy
	shr	\$3+2,$i
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	jmp	.Lsqr8x_copy_n

.align	32
.Lsqr8x_copy_n:
	movq	8*0($nptr),%xmm0
	movq	8*1($nptr),%xmm1
	movq	8*2($nptr),%xmm3
	movq	8*3($nptr),%xmm4
	lea	8*4($nptr),$nptr
	movdqa	%xmm0,16*0(%r11)
	movdqa	%xmm1,16*1(%r11)
	movdqa	%xmm3,16*2(%r11)
	movdqa	%xmm4,16*3(%r11)
	lea	16*4(%r11),%r11
	dec	$i
	jnz	.Lsqr8x_copy_n

	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	%r10, %xmm3		# -$num
___
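# Why the dance above: on out-of-order x86 cores, two memory accesses whose
# addresses are equal modulo 4096 can be falsely flagged as dependent, so
# the frame is slid until t[] and a[] land in distinct 4K-residue ranges.
# A hedged one-liner (illustrative, not used) for the hazard being avoided:
sub _may_4k_alias {
	my ($store_addr, $load_addr) = @_;
	return (($store_addr & 4095) == ($load_addr & 4095));
}
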
$code.=<<___ if ($addx);
	and	\$0x80100,%eax
	cmp	\$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# see x86_64-mont5 module

	pxor	%xmm0,%xmm0
	lea	48(%rsp),%rax
	lea	64(%rsp,$num,2),%rdx
	shr	\$3+2,$num
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lsqr8x_zero

.align	32
.Lsqr8x_nox:
___
$code.=<<___;
	call	bn_sqr8x_internal	# see x86_64-mont5 module

	pxor	%xmm0,%xmm0
	lea	48(%rsp),%rax
	lea	64(%rsp,$num,2),%rdx
	shr	\$3+2,$num
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lsqr8x_zero

.align	32
.Lsqr8x_zero:
	movdqa	%xmm0,16*0(%rax)	# wipe t
	movdqa	%xmm0,16*1(%rax)
	movdqa	%xmm0,16*2(%rax)
	movdqa	%xmm0,16*3(%rax)
	lea	16*4(%rax),%rax
	movdqa	%xmm0,16*0(%rdx)	# wipe n
	movdqa	%xmm0,16*1(%rdx)
	movdqa	%xmm0,16*2(%rdx)
	movdqa	%xmm0,16*3(%rdx)
	lea	16*4(%rdx),%rdx
	dec	$num
	jnz	.Lsqr8x_zero

	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lsqr8x_epilogue:
	ret
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}

if ($addx) {{{
my $bp="%rdx";	# original value

$code.=<<___;
.type	bn_mulx4x_mont,\@function,6
.align	32
bn_mulx4x_mont:
.Lmulx4x_enter:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	shl	\$3,${num}d		# convert $num to bytes
	.byte	0x67
	xor	%r10,%r10
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rsp	# alloca(frame+$num+8)
	lea	($bp,$num),%r10
	and	\$-128,%rsp
	##############################################################
	# Stack layout
	# +0	num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	saved n0
	# +32	saved rp
	# +40	saved %rsp
	# +48	inner counter
	# +56
	# +64	tmp[num+1]
	#
	mov	$num,0(%rsp)		# save $num
	shr	\$5,$num
	mov	%r10,16(%rsp)		# end of b[num]
	sub	\$1,$num
	mov	$n0, 24(%rsp)		# save *n0
	mov	$rp, 32(%rsp)		# save $rp
	mov	%rax,40(%rsp)		# save original %rsp
	mov	$num,48(%rsp)		# inner counter
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
___
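# The same frame map as the "Stack layout" comment above, captured as a
# hedged Perl-side reference table (our naming; nothing below consults it):
my %mulx4x_frame = (
	num     =>  0,	# saved $num, in bytes
	bi      =>  8,	# off-loaded &b[i]
	b_end   => 16,	# end of b[num]
	n0      => 24,	# saved *n0
	rp      => 32,	# saved $rp
	rsp     => 40,	# saved original %rsp
	counter => 48,	# inner loop counter
	tmp     => 64,	# tmp[num+1]
);
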
my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
$code.=<<___;
	lea	8($bp),$bptr
	mov	($bp),%rdx		# b[0], $bp==%rdx actually
	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	add	%rax,%r11
	mov	$bptr,8(%rsp)		# off-load &b[i]
	mulx	2*8($aptr),%r12,%r13	# ...
	adc	%r14,%r12
	adc	\$0,%r13

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	 mov	$mi,%rdx
	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 .byte	0x67,0x67
	 mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	add	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	mov	($bptr),%rdx		# b[i]
	lea	8($bptr),$bptr		# b++
	sub	$num,$aptr		# rewind $aptr
	mov	%r15,($tptr)		# save top-most carry
	lea	64+4*8(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr

	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	adox	$zero,%r12
	adcx	$zero,%r13

	mov	$bptr,8(%rsp)		# off-load &b[i]
	.byte	0x67
	mov	$mi,%r15
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	 mov	$mi,%rdx
	adox	-2*8($tptr),%r12
	adcx	%rax,%r13
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	lea	4*8($aptr),$aptr
	adox	$zero,%r14

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	lea	4*8($nptr),$nptr
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-4*8($tptr)
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	mov	-8($nptr),$mi
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jne	.Lmulx4x_outer

	sub	%r14,$mi		# compare top-most words
	sbb	$mi,$mi
	or	$mi,%r15

	neg	$num
	xor	%rdx,%rdx
	mov	32(%rsp),$rptr		# restore rp
	lea	64(%rsp),$tptr

	pxor	%xmm0,%xmm0
	mov	0*8($nptr,$num),%r8
	mov	1*8($nptr,$num),%r9
	neg	%r8
	jmp	.Lmulx4x_sub_entry

.align	32
.Lmulx4x_sub:
	mov	0*8($nptr,$num),%r8
	mov	1*8($nptr,$num),%r9
	not	%r8
.Lmulx4x_sub_entry:
	mov	2*8($nptr,$num),%r10
	not	%r9
	and	%r15,%r8
	mov	3*8($nptr,$num),%r11
	not	%r10
	and	%r15,%r9
	not	%r11
	and	%r15,%r10
	and	%r15,%r11

	neg	%rdx			# mov %rdx,%cf
	adc	0*8($tptr),%r8
	adc	1*8($tptr),%r9
	movdqa	%xmm0,($tptr)
	adc	2*8($tptr),%r10
	adc	3*8($tptr),%r11
	movdqa	%xmm0,16($tptr)
	lea	4*8($tptr),$tptr
	sbb	%rdx,%rdx		# mov %cf,%rdx

	mov	%r8,0*8($rptr)
	mov	%r9,1*8($rptr)
	mov	%r10,2*8($rptr)
	mov	%r11,3*8($rptr)
	lea	4*8($rptr),$rptr

	add	\$32,$num
	jnz	.Lmulx4x_sub

	mov	40(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmulx4x_epilogue:
	ret
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
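# .Lmulx4x_sub above performs a branch-free final subtraction: each modulus
# word is complemented and ANDed with the all-zeros/all-ones mask in %r15,
# and an initial carry-in of 1 (the neg in the entry path) completes the
# two's complement, so the loop computes either tp-np or tp unchanged. A
# hedged Math::BigInt model of one limb vector (our naming, illustration only):
sub _masked_final_sub {
	use Math::BigInt;
	my ($tp, $np, $mask) = @_;	# little-endian 64-bit limb array refs
	my $M = Math::BigInt->new(2)->bpow(64)->bsub(1);
	my $carry = $mask ? 1 : 0;	# the "+1" of the two's complement
	my @rp;
	for my $j (0 .. $#{$tp}) {
		my $add = $mask ? $M->copy()->bxor($np->[$j]) : Math::BigInt->bzero();
		my $s   = Math::BigInt->new($tp->[$j])->badd($add)->badd($carry);
		push @rp, $s->copy()->band($M);
		$carry = $s->brsft(64)->is_zero() ? 0 : 1;
	}
	return \@rp;			# == tp - np when $mask is set, else tp
}
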
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	jmp	.Lcommon_seh_tail
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_body,.Lsqr8x_epilogue	# HandlerData[]
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
___
}

print $code;
close STDOUT;