#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
# than twice as fast. The most common case, rsa1024 sign, is improved
# by a respectable 50%. It remains to be seen whether loop unrolling
# and a dedicated squaring routine can provide further improvement...

# July 2011.
#
# Add a dedicated squaring procedure. The performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. The average performance improvement in comparison
# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in the squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually the same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

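# The classic word-by-word Montgomery multiplication this module
# implements, as a hedged C sketch (illustration only; the assembly
# below interleaves the two products and juggles carries differently):
#
#	for (i = 0; i < num; i++) {
#		m1 = (tp[0] + ap[0]*bp[i]) * n0;	/* mod 2^64 */
#		/* tp = (tp + ap*bp[i] + m1*np) / 2^64, one 64-bit word
#		 * at a time with two carry chains; the lowest word of
#		 * the sum is zero by construction of m1 and shifted out. */
#	}
#	/* subtract np once if tp >= np -- see the .Lsub/.Lcopy code */
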
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Set $addx to one once build problems are resolved.
$addx = 0;

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	mov	8(%r11),%r11d
___
$code.=<<___;
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a
	# stack allocation spans two pages, then a reference to the
	# farthest one can be punished with SEGV. But page walking does
	# good even on other OSes, because it guarantees that a
	# misbehaving thread hits the guard page before it can damage
	# an innocent one...
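	#
	# As a hedged C sketch (illustration only, not generated code),
	# the walk below amounts to:
	#
	#	for (probe = old_rsp; probe > new_rsp; probe -= 4096)
	#		(void)*(volatile char *)probe;	/* touch each page */
	#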
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

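	# Final reduction: compute rp=tp-np and use the borrow to select
	# the result in constant time. As a hedged C sketch (illustration
	# only):
	#
	#	mask = 0 - borrow;		/* all-ones iff tp < np */
	#	src  = ((uintptr_t)tp & mask) | ((uintptr_t)rp & ~mask);
	#	memcpy(rp, (void *)src, num*8);	/* zapping tp as we go */
	#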
	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:
	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
	neg	$num			# restore
	and	\$-1024,%r10		# minimize TLB usage

	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
		$bp="%r12";
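# bn_mul4x_mont runs the same algorithm as bn_mul_mont with the inner
# loops unrolled four-fold, amortizing loop overhead over four limbs.
# As a hedged C sketch (illustration only), each step of an inner pass
# computes (the first pass omits the tp[j] term):
#
#	t       = ap[j]*bp[i] + tp[j] + carry_a;	carry_a = t >> 64;
#	u       = np[j]*m1 + (t & MASK64) + carry_n;	carry_n = u >> 64;
#	tp[j-1] = u & MASK64;
#
# with four such steps per iteration of .L1st4x/.Linner4x.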
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jb	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	lea	-4($num),$j
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$j			# j=num/4-1
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i+=4
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-4($num),$j
	or	$np,$ap			# ap=borrow?tp:rp
	shr	\$2,$j			# j=num/4-1

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:					# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi, 8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
{{{
######################################################################
# void bn_sqr8x_mont(
my $rptr="%rdi";	# const BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# not used
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

$code.=<<___	if ($addx);
.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
___
$code.=<<___;
.extern	bn_sqr8x_internal		# see x86_64-mont5 module

.type	bn_sqr8x_mont,\@function,6
.align	32
bn_sqr8x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lsqr8x_prologue:

	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10		# 4*$num
	neg	$num

	##############################################################
	# Ensure that the stack frame doesn't alias with $aptr modulo
	# 4096. This is done to allow the memory disambiguation logic
	# to do its job.
	#
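	# As a hedged sketch (illustration only): the arithmetic below
	# chooses the new stack top so that no address in the allocated
	# frame is congruent to an a[] address modulo 4096, i.e. loads
	# from the input and stores to the t[] vector never land in the
	# same 4K-aliasing set, which would defeat that logic.
	#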
	lea	-64(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lsqr8x_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	mov	$num,%r10
	neg	$num

	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
.Lsqr8x_body:

	movq	$nptr, %xmm2		# save pointer to modulus
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	%r10, %xmm3		# -$num
___
$code.=<<___ if ($addx);
	leaq	OPENSSL_ia32cap_P(%rip),%rax
	mov	8(%rax),%eax
	and	\$0x80100,%eax
	cmp	\$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %rcx	-8*num
					# %r8	end of tp[2*num]
	lea	(%r8,%rcx),%rbx
	mov	%rcx,$num
	mov	%rcx,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
___
$code.=<<___;
	call	bn_sqr8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %r8	-8*num
					# %rdi	end of tp[2*num]
	lea	(%rdi,$num),%rbx
	mov	$num,%rcx
	mov	$num,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	mov	8*0(%rbx),%r12
	mov	8*1(%rbx),%r13
	mov	8*2(%rbx),%r14
	mov	8*3(%rbx),%r15
	lea	8*4(%rbx),%rbx
	sbb	8*0(%rbp),%r12
	sbb	8*1(%rbp),%r13
	sbb	8*2(%rbp),%r14
	sbb	8*3(%rbp),%r15
	lea	8*4(%rbp),%rbp
	mov	%r12,8*0($rptr)
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr
	inc	%rcx			# preserves %cf
	jnz	.Lsqr8x_sub

	sbb	\$0,%rax		# top-most carry
	lea	(%rbx,$num),%rbx	# rewind
	lea	($rptr,$num),$rptr	# rewind

	movq	%rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

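	# Constant-time select between the subtracted result and the
	# original t[] words, keyed by the borrow mask broadcast into
	# %xmm1. As a hedged C sketch (illustration only), per lane:
	#
	#	out = (t_lane & mask) | (r_lane & ~mask);
	#
	# pcmpeqd derives ~mask in %xmm0, pand/por do the select, and
	# the t[] words are zeroed on the way through.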
.align	32
.Lsqr8x_cond_copy:
	movdqa	16*0(%rbx),%xmm2
	movdqa	16*1(%rbx),%xmm3
	lea	16*2(%rbx),%rbx
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2(%rbx)	# zero tp
	movdqa	%xmm0,-16*1(%rbx)
	movdqa	%xmm0,-16*2(%rbx,%rdx)
	movdqa	%xmm0,-16*1(%rbx,%rdx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	add	\$32,$num
	jnz	.Lsqr8x_cond_copy

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}

if ($addx) {{{
my $bp="%rdx";	# original value

$code.=<<___;
.type	bn_mulx4x_mont,\@function,6
.align	32
bn_mulx4x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
	and	\$-128,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	lea	($bp,$num),%r10
	##############################################################
	# Stack layout
	# +0	num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	saved n0
	# +32	saved rp
	# +40	saved %rsp
	# +48	inner counter
	# +56
	# +64	tmp[num+1]
	#
	mov	$num,0(%rsp)		# save $num
	shr	\$5,$num
	mov	%r10,16(%rsp)		# end of b[num]
	sub	\$1,$num
	mov	$n0, 24(%rsp)		# save *n0
	mov	$rp, 32(%rsp)		# save $rp
	mov	%rax,40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
	mov	$num,48(%rsp)		# inner counter
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
___
my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
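# MULX computes a full 64x64->128-bit product without touching the flags,
# while ADCX and ADOX add with carry through CF and OF respectively.
# Keeping two independent carry chains lets the a[]*b[i] products and the
# n[]*m1 reduction run interleaved without serializing on a single carry
# flag. As a hedged sketch of the instruction semantics (illustration
# only, AT&T operand order):
#
#	mulx	src, lo, hi	# hi:lo = %rdx * src, flags untouched
#	adcx	src, dst	# dst += src + CF, updates only CF
#	adox	src, dst	# dst += src + OF, updates only OF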
$code.=<<___;
	lea	8($bp),$bptr
	mov	($bp),%rdx		# b[0], $bp==%rdx actually
	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	add	%rax,%r11
	mov	$bptr,8(%rsp)		# off-load &b[i]
	mulx	2*8($aptr),%r12,%r13	# ...
	adc	%r14,%r12
	adc	\$0,%r13

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	 mov	$mi,%rdx
	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 .byte	0x67,0x67
	 mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	add	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	mov	($bptr),%rdx		# b[i]
	lea	8($bptr),$bptr		# b++
	sub	$num,$aptr		# rewind $aptr
	mov	%r15,($tptr)		# save top-most carry
	lea	64+4*8(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr

	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	adox	-2*8($tptr),%r12
	adcx	$zero,%r13
	adox	$zero,%r13

	mov	$bptr,8(%rsp)		# off-load &b[i]
	mov	$mi,%r15
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	 mov	$mi,%rdx
	adcx	%rax,%r13
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	lea	4*8($aptr),$aptr
	adox	$zero,%r14

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	lea	4*8($nptr),$nptr
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-4*8($tptr)
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jne	.Lmulx4x_outer

	lea	64(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr
	neg	%r15
	mov	$num,%rdx
	shr	\$3+2,$num		# %cf=0
	mov	32(%rsp),$rptr		# restore rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	mov	8*0($tptr),%r11
	mov	8*1($tptr),%r12
	mov	8*2($tptr),%r13
	mov	8*3($tptr),%r14
	lea	8*4($tptr),$tptr
	sbb	8*0($nptr),%r11
	sbb	8*1($nptr),%r12
	sbb	8*2($nptr),%r13
	sbb	8*3($nptr),%r14
	lea	8*4($nptr),$nptr
	mov	%r11,8*0($rptr)
	mov	%r12,8*1($rptr)
	mov	%r13,8*2($rptr)
	mov	%r14,8*3($rptr)
	lea	8*4($rptr),$rptr
	dec	$num			# preserves %cf
	jnz	.Lmulx4x_sub

	sbb	\$0,%r15		# top-most carry
	lea	64(%rsp),$tptr
	sub	%rdx,$rptr		# rewind

	movq	%r15,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	movdqa	16*0($tptr),%xmm2
	movdqa	16*1($tptr),%xmm3
	lea	16*2($tptr),$tptr
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2($tptr)	# zero tp
	movdqa	%xmm0,-16*1($tptr)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	sub	\$32,%rdx
	jnz	.Lmulx4x_cond_copy

	mov	%rdx,($tptr)

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

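# Both handlers follow the same pattern: if context->Rip lies between the
# prologue and epilogue labels recorded in HandlerData[], recover the
# caller's stack pointer saved by the function's prologue (at tp[num+1]
# for mul_handler, at 40(%rsp) for sqr_handler), pop the six non-volatile
# registers stored just below it, and hand off to RtlVirtualUnwind. As a
# hedged C sketch (illustration only):
#
#	if (prologue <= Rip && Rip < epilogue) {
#		rax = saved_rsp;
#		Rbx = ((u64 *)rax)[-1];	/* ... down to R15 at [-6] */
#	}
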
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_prologue
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# body label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue		# HandlerData[]
.align	8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
___
}

print $code;
close STDOUT;