#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to a powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.
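#
# (Illustrative sketch, not part of the build.) The scatter layout used
# by BN_mod_exp_mont_consttime stores limb j of the power selected by
# idx at table[j*2^5 + idx], so reading back any power walks the same
# sequence of cache lines. A minimal C model of the scatter side, with
# hypothetical names:
#
#	void scatter5(BN_ULONG *table, const BN_ULONG *b,
#	              int num, int idx)
#	{
#		for (int j = 0; j < num; j++)
#			table[j * 32 + idx] = b[j];	/* stride 2^5 */
#	}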

# August 2013.
#
# Add MULX/AD*X code paths and additional interfaces to optimize for
# branch prediction unit. For input lengths that are multiples of 8
# the np argument is not just the modulus value, but one interleaved
# with 0. This is to optimize post-condition...
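#
# (Illustrative note.) "Interleaved with 0" means np[2*i] = n[i] and
# np[2*i+1] = 0; that is why those code paths step through $np in
# 16-byte units ("interleaved with 0, therefore 16*n" below) and why
# the post-condition can pick between subtracting n[i] and subtracting
# 0 with a mere 8-byte pointer offset. A sketch of the caller-side
# layout this assumes:
#
#	for (i = 0; i < num; i++) {	/* num % 8 == 0 */
#		np2[2 * i]     = n[i];
#		np2[2 * i + 1] = 0;
#	}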

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable this after testing. $addx goes up to 1.
$addx = 0;

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such manner that b[0] is $bp[idx],
				# b[1] is $bp[2^5+idx], etc.
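				# (illustrative) i.e. limb j of the
				# selected power lives at
				# $bp[j*2^5 + idx]; in C terms a full
				# gather is:
				#   for (j = 0; j < num; j++)
				#     b[j] = table[j*32 + idx];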
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	test	\$7,${num}d
	jnz	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	mov	${num}d,${num}d
	mov	%rsp,%rax
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	lea	2($num),%r11
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
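		# i.e. $STRIDE is 2**5 entries of 8 bytes = 256 bytes
		# per limb row, and with 64-byte cache lines each row
		# spans $STRIDE/$N = 4 lines of $N/8 = 8 entries each.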
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	movq	%xmm0,$m0		# m0=bp[0]
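
	# (illustrative) the movq/pand/por cluster above is a constant-
	# time gather: one qword is loaded from each of the 4 cache lines
	# holding this limb row and the .Lmagic_masks entries keep exactly
	# one of them, e.g. in C terms:
	#
	#	r = 0;
	#	for (l = 0; l < 4; l++)
	#		r |= line[l] & mask[l];	/* one mask is all-ones */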

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	movq	%xmm0,$m0		# bp[1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	movq	%xmm0,$m0		# bp[i+1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	mov	$num,$j			# j=num
.align	16
.Lcopy:					# copy or in-place refresh
	mov	(%rsp,$i,8),$ap
	mov	($rp,$i,8),$np
	xor	$np,$ap			# conditional select:
	and	%rax,$ap		# ((ap ^ np) & %rax) ^ np
	xor	$np,$ap			# ap = borrow?tp:rp
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	$ap,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy
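
	# (illustrative) the xor/and/xor trio above is the classic
	# branchless select; per limb, with mask in rax equal to 0 or ~0:
	#
	#	rp[i] = ((tp[i] ^ rp[i]) & mask) ^ rp[i];
	#
	# which yields tp[i] when mask is all-ones (borrow) and rp[i]
	# otherwise, with no secret-dependent branch or address.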

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	-88(%rsi),%xmm6
	movaps	-72(%rsi),%xmm7
___
$code.=<<___;
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	32
bn_mul4x_mont_gather5:
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	.byte	0x67
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	.byte	0x67
	mov	${num}d,%r10d
	shl	\$3,${num}d
	shl	\$3+2,%r10d		# 4*$num
	neg	$num			# -$num

	##############################################################
	# ensure that stack frame doesn't alias with $aptr+4*$num
	# modulo 4096, which covers ret[num], am[num] and n[2*num]
	# (see bn_exp.c). this is done to allow memory disambiguation
	# logic to do its magic. [excessive frame is allocated in order
	# to allow bn_from_mont8x to clear it.]
	#
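	# (illustrative) the goal, stated in C-like terms, is to choose
	# the frame base so that
	#
	#	((frame_base - aptr) & 4095) == 0 ||
	#	((frame_base - aptr) & 4095) >  4*num*8
	#
	# i.e. no stack slot is congruent modulo 4096 to a live input
	# address. (A sketch of the invariant, not of the exact code.)
	#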
	lea	-64(%rsp,$num,2),%r11
	sub	$ap,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lmul4xsp_alt
	sub	%r11,%rsp		# align with $ap
	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	lea	4096-64(,$num,2),%r10
	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
.Lmul4xsp_done:
	and	\$-64,%rsp
	neg	$num

	mov	%rax,40(%rsp)
.Lmul4x_body:

	call	mul4x_internal

	mov	40(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	-88(%rsi),%xmm6
	movaps	-72(%rsi),%xmm7
___
$code.=<<___;
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

.type	mul4x_internal,\@abi-omnipotent
.align	32
mul4x_internal:
	shl	\$5,$num
	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
	lea	256(%rdx,$num),%r13
	shr	\$5,$num		# restore $num
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
		$tp=$i;
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96(%rdx,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	add	\$7,%r11
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7
	and	\$7,%r11

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	lea	$STRIDE($bp),$tp	# borrow $tp
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	.byte	0x67
	por	%xmm1,%xmm0
	movq	`0*$STRIDE/4-96`($tp),%xmm1
	.byte	0x67
	pand	%xmm7,%xmm3
	.byte	0x67
	por	%xmm2,%xmm0
	movq	`1*$STRIDE/4-96`($tp),%xmm2
	.byte	0x67
	pand	%xmm4,%xmm1
	.byte	0x67
	por	%xmm3,%xmm0
	movq	`2*$STRIDE/4-96`($tp),%xmm3

	movq	%xmm0,$m0		# m0=bp[0]
	movq	`3*$STRIDE/4-96`($tp),%xmm0
	mov	%r13,16+8(%rsp)		# save end of b[num]
	mov	$rp, 56+8(%rsp)		# save $rp

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax
	lea	($ap,$num),$ap		# end of a[num]
	neg	$num

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	pand	%xmm5,%xmm2
	pand	%xmm6,%xmm3
	por	%xmm2,%xmm1

	imulq	$A[0],$m1		# "tp[0]"*n0
	##############################################################
	# $tp is chosen so that writing to top-most element of the
	# vector occurs just "above" references to powers table,
	# "above" modulo cache-line size, which effectively precludes
	# possibility of memory disambiguation logic failure when
	# accessing the table.
	#
	lea	64+8(%rsp,%r11,8),$tp
	mov	%rdx,$A[1]

	pand	%xmm7,%xmm0
	por	%xmm3,%xmm1
	lea	2*$STRIDE($bp),$bp
	por	%xmm1,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap,$num),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap,$num),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4*8($num),$j		# j=4
	lea	16*4($np),$np
	adc	\$0,%rdx
	mov	$N[1],($tp)
	mov	%rdx,$N[0]
	jmp	.L1st4x

.align	32
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16*2($np),%rax
	lea	32($tp),$tp
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-16*1($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	16*0($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	16*1($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	lea	16*4($np),$np
	adc	\$0,%rdx
	mov	$N[1],($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	add	\$32,$j			# j+=4
	jnz	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16*2($np),%rax
	lea	32($tp),$tp
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-16*1($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$num),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[1]
	lea	($np,$num,2),$np	# rewind $np

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8($tp)

	jmp	.Louter4x

.align	32
.Louter4x:
	mov	($tp,$num),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3

	imulq	$A[0],$m1		# tp[0]*n0
	.byte	0x67
	mov	%rdx,$A[1]
	mov	$N[1],($tp)		# store upmost overflow bit

	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	($tp,$num),$tp		# rewind $tp
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap,$num),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
	adc	\$0,%rdx
	add	8($tp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap,$num),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4*8($num),$j		# j=4
	lea	16*4($np),$np
	adc	\$0,%rdx
	mov	%rdx,$N[0]
	jmp	.Linner4x

.align	32
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16*2($np),%rax
	adc	\$0,%rdx
	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
	lea	32($tp),$tp
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-16*1($np),%rax
	adc	\$0,%rdx
	add	-8($tp),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	16*0($np),%rax
	adc	\$0,%rdx
	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-16($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	16*1($np),%rax
	adc	\$0,%rdx
	add	8($tp),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	16*4($np),$np
	adc	\$0,%rdx
	mov	$N[0],-8($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	add	\$32,$j			# j+=4
	jnz	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16*2($np),%rax
	adc	\$0,%rdx
	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
	lea	32($tp),$tp
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	$m1,%rax
	mov	-16*1($np),$m1
	adc	\$0,%rdx
	add	-8($tp),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$num),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[i+1]
	mov	$N[1],-16($tp)		# tp[j-1]
	lea	($np,$num,2),$np	# rewind $np

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	($tp),$N[0]		# pull upmost overflow bit
	adc	\$0,$N[1]		# upmost overflow bit
	mov	$N[0],-8($tp)

	cmp	16+8(%rsp),$bp
	jb	.Louter4x
___
if (1) {
$code.=<<___;
	sub	$N[0],$m1		# compare top-most words
	adc	$j,$j			# $j is zero
	or	$j,$N[1]
	xor	\$1,$N[1]
	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
	lea	($np,$N[1],8),%rbp	# nptr in .sqr4x_sub
	mov	%r9,%rcx
	sar	\$3+2,%rcx		# cf=0
	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
	jmp	.Lsqr4x_sub
___
} else {
my @ri=("%rax",$bp,$m0,$m1);
my $rp="%rdx";
$code.=<<___
	xor	\$1,$N[1]
	lea	($tp,$num),$tp		# rewind $tp
	sar	\$5,$num		# cf=0
	lea	($np,$N[1],8),$np
	mov	56+8(%rsp),$rp		# restore $rp
	jmp	.Lsub4x

.align	32
.Lsub4x:
	.byte	0x66
	mov	8*0($tp),@ri[0]
	mov	8*1($tp),@ri[1]
	.byte	0x66
	sbb	16*0($np),@ri[0]
	mov	8*2($tp),@ri[2]
	sbb	16*1($np),@ri[1]
	mov	3*8($tp),@ri[3]
	lea	4*8($tp),$tp
	sbb	16*2($np),@ri[2]
	mov	@ri[0],8*0($rp)
	sbb	16*3($np),@ri[3]
	lea	16*4($np),$np
	mov	@ri[1],8*1($rp)
	mov	@ri[2],8*2($rp)
	mov	@ri[3],8*3($rp)
	lea	8*4($rp),$rp

	inc	$num
	jnz	.Lsub4x

	ret
___
}
$code.=<<___;
.size	mul4x_internal,.-mul4x_internal
___
}}}
{{{
######################################################################
# void bn_power5(
my $rptr="%rdi";	# BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# const void *table,
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8
			# int pwr
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

$code.=<<___;
.globl	bn_power5
.type	bn_power5,\@function,6
.align	32
bn_power5:
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lpowerx5_enter
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10d		# 4*$num
	neg	$num
	mov	($n0),$n0		# *n0

	##############################################################
	# ensure that stack frame doesn't alias with $aptr+4*$num
	# modulo 4096, which covers ret[num], am[num] and n[2*num]
	# (see bn_exp.c). this is done to allow memory disambiguation
	# logic to do its magic.
	#
	lea	-64(%rsp,$num,2),%r11
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lpwr_sp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
.Lpwr_sp_done:
	and	\$-64,%rsp
	mov	$num,%r10
	neg	$num

	##############################################################
	# Stack layout
	#
	# +0	saved $num, used in reduction section
	# +8	&t[2*$num], used in reduction section
	# +32	saved *n0
	# +40	saved %rsp
	# +48	t[2*$num]
	#
	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.Lpower5_body:
	movq	$rptr,%xmm1		# save $rptr
	movq	$nptr,%xmm2		# save $nptr
	movq	%r10, %xmm3		# -$num
	movq	$bptr,%xmm4

	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
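
	# (illustrative) five back-to-back squarings raise the input to
	# its 2^5-th power: a -> a^2 -> a^4 -> a^8 -> a^16 -> a^32,
	# matching the 5-bit window; the mul4x_internal call below then
	# multiplies by the table power selected by pwr.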

	movq	%xmm2,$nptr
	movq	%xmm4,$bptr
	mov	$aptr,$rptr
	mov	40(%rsp),%rax
	lea	32(%rsp),$n0

	call	mul4x_internal

	mov	40(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lpower5_epilogue:
	ret
.size	bn_power5,.-bn_power5

.globl	bn_sqr8x_internal
.hidden	bn_sqr8x_internal
.type	bn_sqr8x_internal,\@abi-omnipotent
.align	32
bn_sqr8x_internal:
__bn_sqr8x_internal:
	##############################################################
	# Squaring part:
	#
	# a) multiply-n-add everything but a[i]*a[i];
	# b) shift result of a) by 1 to the left and accumulate
	#    a[i]*a[i] products;
	#
	##############################################################
	#                                                     a[1]a[0]
	#                                                 a[2]a[0]
	#                                             a[3]a[0]
	#                                             a[2]a[1]
	#                                         a[4]a[0]
	#                                         a[3]a[1]
	#                                     a[5]a[0]
	#                                     a[4]a[1]
	#                                     a[3]a[2]
	#                                 a[6]a[0]
	#                                 a[5]a[1]
	#                                 a[4]a[2]
	#                             a[7]a[0]
	#                             a[6]a[1]
	#                             a[5]a[2]
	#                             a[4]a[3]
	#                         a[7]a[1]
	#                         a[6]a[2]
	#                         a[5]a[3]
	#                     a[7]a[2]
	#                     a[6]a[3]
	#                     a[5]a[4]
	#                 a[7]a[3]
	#                 a[6]a[4]
	#             a[7]a[4]
	#             a[6]a[5]
	#         a[7]a[5]
	#     a[7]a[6]
	#                                                     a[1]a[0]
	#                                                 a[2]a[0]
	#                                             a[3]a[0]
	#                                         a[4]a[0]
	#                                     a[5]a[0]
	#                                 a[6]a[0]
	#                             a[7]a[0]
	#                                             a[2]a[1]
	#                                         a[3]a[1]
	#                                     a[4]a[1]
	#                                 a[5]a[1]
	#                             a[6]a[1]
	#                         a[7]a[1]
	#                                     a[3]a[2]
	#                                 a[4]a[2]
	#                             a[5]a[2]
	#                         a[6]a[2]
	#                     a[7]a[2]
	#                             a[4]a[3]
	#                         a[5]a[3]
	#                     a[6]a[3]
	#                 a[7]a[3]
	#                     a[5]a[4]
	#                 a[6]a[4]
	#             a[7]a[4]
	#             a[6]a[5]
	#         a[7]a[5]
	#     a[7]a[6]
	#                                                         a[0]a[0]
	#                                                 a[1]a[1]
	#                                         a[2]a[2]
	#                                 a[3]a[3]
	#                         a[4]a[4]
	#                 a[5]a[5]
	#         a[6]a[6]
	# a[7]a[7]
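
	# (illustrative) i.e. the schedule implements
	#
	#	a^2 = sum over i>j of 2*a[i]*a[j]*2^(64*(i+j))
	#	    + sum over i   of   a[i]*a[i]*2^(64*2*i)
	#
	# part a) accumulates each cross product once, part b) doubles
	# the whole vector with a 1-bit left shift and folds in the
	# a[i]*a[i] diagonal (.Lsqr4x_shift_n_add below).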

	lea	32(%r10),$i		# $i=-($num-32)
	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]

	mov	$num,$j			# $j=$num

					# comments apply to $num==8 case
	mov	-32($aptr,$i),$a0	# a[0]
	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
	mov	-24($aptr,$i),%rax	# a[1]
	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
	mov	-16($aptr,$i),$ai	# a[2]
	mov	%rax,$a1

	mul	$a0			# a[1]*a[0]
	mov	%rax,$A0[0]		# a[1]*a[0]
	 mov	$ai,%rax		# a[2]
	mov	%rdx,$A0[1]
	mov	$A0[0],-24($tptr,$i)	# t[1]

	mul	$a0			# a[2]*a[0]
	add	%rax,$A0[1]
	 mov	$ai,%rax
	adc	\$0,%rdx
	mov	$A0[1],-16($tptr,$i)	# t[2]
	mov	%rdx,$A0[0]


	 mov	-8($aptr,$i),$ai	# a[3]
	mul	$a1			# a[2]*a[1]
	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
	 mov	$ai,%rax
	mov	%rdx,$A1[1]

	 lea	($i),$j
	mul	$a0			# a[3]*a[0]
	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
	 mov	$ai,%rax
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr,$j)	# t[3]
	jmp	.Lsqr4x_1st

.align	32
.Lsqr4x_1st:
	 mov	($aptr,$j),$ai		# a[4]
	mul	$a1			# a[3]*a[1]
	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
	 mov	$ai,%rax
	mov	%rdx,$A1[0]
	adc	\$0,$A1[0]

	mul	$a0			# a[4]*a[0]
	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
	 mov	$ai,%rax		# a[3]
	 mov	8($aptr,$j),$ai		# a[5]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	add	$A1[1],$A0[1]
	adc	\$0,$A0[0]


	mul	$a1			# a[4]*a[3]
	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
	 mov	$ai,%rax
	 mov	$A0[1],($tptr,$j)	# t[4]
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]

	mul	$a0			# a[5]*a[2]
	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
	 mov	$ai,%rax
	 mov	16($aptr,$j),$ai	# a[6]
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]

	mul	$a1			# a[5]*a[3]
	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
	 mov	$ai,%rax
	 mov	$A0[0],8($tptr,$j)	# t[5]
	mov	%rdx,$A1[0]
	adc	\$0,$A1[0]

	mul	$a0			# a[6]*a[2]
	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
	 mov	$ai,%rax		# a[3]
	 mov	24($aptr,$j),$ai	# a[7]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	add	$A1[1],$A0[1]
	adc	\$0,$A0[0]


	mul	$a1			# a[6]*a[5]
	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
	 mov	$ai,%rax
	 mov	$A0[1],16($tptr,$j)	# t[6]
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]
	 lea	32($j),$j

	mul	$a0			# a[7]*a[4]
	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
	 mov	$ai,%rax
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr,$j)	# t[7]

	cmp	\$0,$j
	jne	.Lsqr4x_1st

	mul	$a1			# a[7]*a[5]
	add	%rax,$A1[1]
	lea	16($i),$i
	adc	\$0,%rdx
	add	$A0[1],$A1[1]
	adc	\$0,%rdx

	mov	$A1[1],($tptr)		# t[8]
	mov	%rdx,$A1[0]
	mov	%rdx,8($tptr)		# t[9]
	jmp	.Lsqr4x_outer

.align	32
.Lsqr4x_outer:				# comments apply to $num==6 case
	mov	-32($aptr,$i),$a0	# a[0]
	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
	mov	-24($aptr,$i),%rax	# a[1]
	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
	mov	-16($aptr,$i),$ai	# a[2]
	mov	%rax,$a1

	mul	$a0			# a[1]*a[0]
	mov	-24($tptr,$i),$A0[0]	# t[1]
	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
	 mov	$ai,%rax		# a[2]
	adc	\$0,%rdx
	mov	$A0[0],-24($tptr,$i)	# t[1]
	mov	%rdx,$A0[1]

	mul	$a0			# a[2]*a[0]
	add	%rax,$A0[1]
	 mov	$ai,%rax
	adc	\$0,%rdx
	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	mov	$A0[1],-16($tptr,$i)	# t[2]

	xor	$A1[0],$A1[0]

	 mov	-8($aptr,$i),$ai	# a[3]
	mul	$a1			# a[2]*a[1]
	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
	 mov	$ai,%rax
	adc	\$0,%rdx
	add	-8($tptr,$i),$A1[0]
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]

	mul	$a0			# a[3]*a[0]
	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
	 mov	$ai,%rax
	adc	\$0,%rdx
	add	$A1[0],$A0[0]
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr,$i)	# t[3]

	lea	($i),$j
	jmp	.Lsqr4x_inner

.align	32
.Lsqr4x_inner:
	 mov	($aptr,$j),$ai		# a[4]
	mul	$a1			# a[3]*a[1]
	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
	 mov	$ai,%rax
	mov	%rdx,$A1[0]
	adc	\$0,$A1[0]
	add	($tptr,$j),$A1[1]
	adc	\$0,$A1[0]

	.byte	0x67
	mul	$a0			# a[4]*a[0]
	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
	 mov	$ai,%rax		# a[3]
	 mov	8($aptr,$j),$ai		# a[5]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	add	$A1[1],$A0[1]
	adc	\$0,$A0[0]

	mul	$a1			# a[4]*a[3]
	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
	mov	$A0[1],($tptr,$j)	# t[4]
	 mov	$ai,%rax
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]
	add	8($tptr,$j),$A1[0]
	lea	16($j),$j		# j++
	adc	\$0,$A1[1]

	mul	$a0			# a[5]*a[2]
	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
	 mov	$ai,%rax
	adc	\$0,%rdx
	add	$A1[0],$A0[0]
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below

	cmp	\$0,$j
	jne	.Lsqr4x_inner

	.byte	0x67
	mul	$a1			# a[5]*a[3]
	add	%rax,$A1[1]
	adc	\$0,%rdx
	add	$A0[1],$A1[1]
	adc	\$0,%rdx

	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
	mov	%rdx,$A1[0]
	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below

	add	\$16,$i
	jnz	.Lsqr4x_outer

					# comments apply to $num==4 case
	mov	-32($aptr),$a0		# a[0]
	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
	mov	-24($aptr),%rax		# a[1]
	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
	mov	-16($aptr),$ai		# a[2]
	mov	%rax,$a1

	mul	$a0			# a[1]*a[0]
	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
	 mov	$ai,%rax		# a[2]
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]

	mul	$a0			# a[2]*a[0]
	add	%rax,$A0[1]
	 mov	$ai,%rax
	 mov	$A0[0],-24($tptr)	# t[1]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
	 mov	-8($aptr),$ai		# a[3]
	adc	\$0,$A0[0]

	mul	$a1			# a[2]*a[1]
	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
	 mov	$ai,%rax
	 mov	$A0[1],-16($tptr)	# t[2]
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]

	mul	$a0			# a[3]*a[0]
	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
	 mov	$ai,%rax
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr)	# t[3]

	mul	$a1			# a[3]*a[1]
	add	%rax,$A1[1]
	 mov	-16($aptr),%rax		# a[2]
	adc	\$0,%rdx
	add	$A0[1],$A1[1]
	adc	\$0,%rdx

	mov	$A1[1],($tptr)		# t[4]
	mov	%rdx,$A1[0]
	mov	%rdx,8($tptr)		# t[5]

	mul	$ai			# a[2]*a[3]
___
{
my ($shift,$carry)=($a0,$a1);
my @S=(@A1,$ai,$n0);
$code.=<<___;
	 add	\$16,$i
	 xor	$shift,$shift
	 sub	$num,$i			# $i=16-$num
	 xor	$carry,$carry

	add	$A1[0],%rax		# t[5]
	adc	\$0,%rdx
	mov	%rax,8($tptr)		# t[5]
	mov	%rdx,16($tptr)		# t[6]
	mov	$carry,24($tptr)	# t[7]

	 mov	-16($aptr,$i),%rax	# a[0]
	lea	48+8(%rsp),$tptr
	 xor	$A0[0],$A0[0]		# t[0]
	 mov	8($tptr),$A0[1]		# t[1]

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	 mov	$S[1],8($tptr)
	 sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],16($tptr)
	adc	%rdx,$S[3]
	lea	16($i),$i
	mov	$S[3],24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	lea	64($tptr),$tptr
	jmp	.Lsqr4x_shift_n_add

.align	32
.Lsqr4x_shift_n_add:
	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],-32($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	 mov	$S[1],-24($tptr)
	 sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	 mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],-16($tptr)
	adc	%rdx,$S[3]

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	 mov	$S[3],-8($tptr)
	 sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],0($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	 mov	$S[1],8($tptr)
	 sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],16($tptr)
	adc	%rdx,$S[3]
	mov	$S[3],24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	lea	64($tptr),$tptr
	add	\$32,$i
	jnz	.Lsqr4x_shift_n_add

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	.byte	0x67
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
	mov	$S[0],-32($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
	 mov	$S[1],-24($tptr)
	 sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	adc	%rax,$S[2]
	adc	%rdx,$S[3]
	mov	$S[2],-16($tptr)
	mov	$S[3],-8($tptr)
___
}
######################################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
# This new path is inspired by multiple submissions from Intel, by
# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
# Vinodh Gopal...
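#
# (Illustrative sketch.) One window of word-by-word Montgomery
# reduction, in C-like terms:
#
#	for (i = 0; i < 8; i++) {	/* 8-limb window */
#		m = t[i] * n0;		/* mod 2^64 */
#		add m*n[] to t[] at limb i;	/* forces t[i] to zero */
#	}
#
# the code below keeps the window's live limbs in %r8..%r15 and puts
# each m (the "n0*a[i]" of the comments) aside on the stack for the
# tail passes over the remaining limbs of n[].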
{
my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");

$code.=<<___;
	movq	%xmm2,$nptr
sqr8x_reduction:
	xor	%rax,%rax
	lea	($nptr,$num,2),%rcx	# end of n[]
	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
	mov	%rcx,0+8(%rsp)
	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
	mov	%rdx,8+8(%rsp)
	neg	$num
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	lea	($tptr,$num),$tptr	# start of current t[] window
	.byte	0x66
	mov	8*0($tptr),$m0
	mov	8*1($tptr),%r9
	mov	8*2($tptr),%r10
	mov	8*3($tptr),%r11
	mov	8*4($tptr),%r12
	mov	8*5($tptr),%r13
	mov	8*6($tptr),%r14
	mov	8*7($tptr),%r15
	mov	%rax,(%rdx)		# store top-most carry bit
	lea	8*8($tptr),$tptr

	.byte	0x67
	mov	$m0,%r8
	imulq	32+8(%rsp),$m0		# n0*a[0]
	mov	16*0($nptr),%rax	# n[0]
	mov	\$8,%ecx
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	mulq	$m0
	 mov	16*1($nptr),%rax	# n[1]
	neg	%r8
	mov	%rdx,%r8
	adc	\$0,%r8

	mulq	$m0
	add	%rax,%r9
	 mov	16*2($nptr),%rax
	adc	\$0,%rdx
	add	%r9,%r8
	 mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
	mov	%rdx,%r9
	adc	\$0,%r9

	mulq	$m0
	add	%rax,%r10
	 mov	16*3($nptr),%rax
	adc	\$0,%rdx
	add	%r10,%r9
	 mov	32+8(%rsp),$carry	# pull n0, borrow $carry
	mov	%rdx,%r10
	adc	\$0,%r10

	mulq	$m0
	add	%rax,%r11
	 mov	16*4($nptr),%rax
	adc	\$0,%rdx
	 imulq	%r8,$carry		# modulo-scheduled
	add	%r11,%r10
	mov	%rdx,%r11
	adc	\$0,%r11

	mulq	$m0
	add	%rax,%r12
	 mov	16*5($nptr),%rax
	adc	\$0,%rdx
	add	%r12,%r11
	mov	%rdx,%r12
	adc	\$0,%r12

	mulq	$m0
	add	%rax,%r13
	 mov	16*6($nptr),%rax
	adc	\$0,%rdx
	add	%r13,%r12
	mov	%rdx,%r13
	adc	\$0,%r13

	mulq	$m0
	add	%rax,%r14
	 mov	16*7($nptr),%rax
	adc	\$0,%rdx
	add	%r14,%r13
	mov	%rdx,%r14
	adc	\$0,%r14

	mulq	$m0
	 mov	$carry,$m0		# n0*a[i]
	add	%rax,%r15
	 mov	16*0($nptr),%rax	# n[0]
	adc	\$0,%rdx
	add	%r15,%r14
	mov	%rdx,%r15
	adc	\$0,%r15

	dec	%ecx
	jnz	.L8x_reduce

	lea	16*8($nptr),$nptr
	xor	%rax,%rax
	mov	8+8(%rsp),%rdx		# pull end of t[]
	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.L8x_no_tail

	.byte	0x66
	add	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	sbb	$carry,$carry		# top carry

	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
	mov	\$8,%ecx
	mov	16*0($nptr),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	mulq	$m0
	add	%rax,%r8
	 mov	16*1($nptr),%rax
	 mov	%r8,($tptr)		# save result
	mov	%rdx,%r8
	adc	\$0,%r8

	mulq	$m0
	add	%rax,%r9
	 mov	16*2($nptr),%rax
	adc	\$0,%rdx
	add	%r9,%r8
	 lea	8($tptr),$tptr		# $tptr++
	mov	%rdx,%r9
	adc	\$0,%r9

	mulq	$m0
	add	%rax,%r10
	 mov	16*3($nptr),%rax
	adc	\$0,%rdx
	add	%r10,%r9
	mov	%rdx,%r10
	adc	\$0,%r10

	mulq	$m0
	add	%rax,%r11
	 mov	16*4($nptr),%rax
	adc	\$0,%rdx
	add	%r11,%r10
	mov	%rdx,%r11
	adc	\$0,%r11

	mulq	$m0
	add	%rax,%r12
	 mov	16*5($nptr),%rax
	adc	\$0,%rdx
	add	%r12,%r11
	mov	%rdx,%r12
	adc	\$0,%r12

	mulq	$m0
	add	%rax,%r13
	 mov	16*6($nptr),%rax
	adc	\$0,%rdx
	add	%r13,%r12
	mov	%rdx,%r13
	adc	\$0,%r13

	mulq	$m0
	add	%rax,%r14
	 mov	16*7($nptr),%rax
	adc	\$0,%rdx
	add	%r14,%r13
	mov	%rdx,%r14
	adc	\$0,%r14

	mulq	$m0
	 mov	48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
	add	%rax,%r15
	adc	\$0,%rdx
	add	%r15,%r14
	 mov	16*0($nptr),%rax	# pull n[0]
	mov	%rdx,%r15
	adc	\$0,%r15

	dec	%ecx
	jnz	.L8x_tail

	lea	16*8($nptr),$nptr
	mov	8+8(%rsp),%rdx		# pull end of t[]
	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.L8x_tail_done		# break out of loop

	 mov	48+56+8(%rsp),$m0	# pull n0*a[0]
	neg	$carry
	 mov	8*0($nptr),%rax		# pull n[0]
	adc	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	sbb	$carry,$carry		# top carry

	mov	\$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	add	(%rdx),%r8		# can this overflow?
	adc	\$0,%r9
	adc	\$0,%r10
	adc	\$0,%r11
	adc	\$0,%r12
	adc	\$0,%r13
	adc	\$0,%r14
	adc	\$0,%r15		# can't overflow, because we
					# started with "overhung" part
					# of multiplication
	xor	%rax,%rax

	neg	$carry
.L8x_no_tail:
	adc	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	adc	\$0,%rax		# top-most carry
	 mov	-16($nptr),%rcx		# np[num-1]
	 xor	$carry,$carry

	movq	%xmm2,$nptr		# restore $nptr

	mov	%r8,8*0($tptr)		# store top 512 bits
	mov	%r9,8*1($tptr)
	 movq	%xmm3,$num		# $num is %r9, can't be moved upwards
	mov	%r10,8*2($tptr)
	mov	%r11,8*3($tptr)
	mov	%r12,8*4($tptr)
	mov	%r13,8*5($tptr)
	mov	%r14,8*6($tptr)
	mov	%r15,8*7($tptr)
	lea	8*8($tptr),$tptr

	cmp	%rdx,$tptr		# end of t[]?
	jb	.L8x_reduction_loop
___
}
##############################################################
# Post-condition, 4x unrolled
#
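# (Illustrative.) The reduced value lies in [0, 2n), so n is subtracted
# at most once. Because np[] is interleaved with zero limbs, the code
# below subtracts unconditionally and steers the choice through the
# pointer instead: when the result is already below n, $nptr is advanced
# 8 bytes onto the interleaved zeros, so the sbb chain subtracts 0.
# Roughly:
#
#	nptr += subtract_needed ? 0 : 8;	/* odd slots hold zeros */
#	for (i = 0; i < num; i++)
#		r[i] = t[i] - nptr_limb(2*i);	/* borrow-chained */
#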
{
my ($tptr,$nptr)=("%rbx","%rbp");
$code.=<<___;
	#xor	%rsi,%rsi		# %rsi was $carry above
	sub	%r15,%rcx		# compare top-most words
	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
	adc	%rsi,%rsi
	mov	$num,%rcx
	or	%rsi,%rax
	movq	%xmm1,$rptr		# restore $rptr
	xor	\$1,%rax
	movq	%xmm1,$aptr		# prepare for back-to-back call
	lea	($nptr,%rax,8),$nptr
	sar	\$3+2,%rcx		# cf=0
	jmp	.Lsqr4x_sub

.align	32
.Lsqr4x_sub:
	.byte	0x66
	mov	8*0($tptr),%r12
	mov	8*1($tptr),%r13
	sbb	16*0($nptr),%r12
	mov	8*2($tptr),%r14
	sbb	16*1($nptr),%r13
	mov	8*3($tptr),%r15
	lea	8*4($tptr),$tptr
	sbb	16*2($nptr),%r14
	mov	%r12,8*0($rptr)
	sbb	16*3($nptr),%r15
	lea	16*4($nptr),$nptr
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr

	inc	%rcx			# pass %cf
	jnz	.Lsqr4x_sub
___
}
$code.=<<___;
	mov	$num,%r10		# prepare for back-to-back call
	neg	$num			# restore $num
	ret
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
___
{
$code.=<<___;
.globl	bn_from_montgomery
.type	bn_from_montgomery,\@abi-omnipotent
.align	32
bn_from_montgomery:
	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
	jz	bn_from_mont8x
	xor	%eax,%eax
	ret
.size	bn_from_montgomery,.-bn_from_montgomery
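
	# (illustrative) bn_from_mont8x converts out of Montgomery form,
	# computing x*R^-1 mod n as one Montgomery reduction of x widened
	# to 2*num limbs; .Lmul_by_1 below sets that up by copying a[]
	# into the low half of t[] and zeroing the high half.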

.type	bn_from_mont8x,\@function,6
.align	32
bn_from_mont8x:
	.byte	0x67
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	.byte	0x67
	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10d		# 4*$num
	neg	$num
	mov	($n0),$n0		# *n0

	##############################################################
	# ensure that stack frame doesn't alias with $aptr+4*$num
	# modulo 4096, which covers ret[num], am[num] and n[2*num]
	# (see bn_exp.c). this is done to allow memory disambiguation
	# logic to do its magic.
   1908 	#
   1909 	lea	-64(%rsp,$num,2),%r11
   1910 	sub	$aptr,%r11
   1911 	and	\$4095,%r11
   1912 	cmp	%r11,%r10
   1913 	jb	.Lfrom_sp_alt
   1914 	sub	%r11,%rsp		# align with $aptr
   1915 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
   1916 	jmp	.Lfrom_sp_done
   1917 
   1918 .align	32
   1919 .Lfrom_sp_alt:
   1920 	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
   1921 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
   1922 	sub	%r10,%r11
   1923 	mov	\$0,%r10
   1924 	cmovc	%r10,%r11
   1925 	sub	%r11,%rsp
   1926 .Lfrom_sp_done:
   1927 	and	\$-64,%rsp
   1928 	mov	$num,%r10	
   1929 	neg	$num
   1930 
   1931 	##############################################################
   1932 	# Stack layout
   1933 	#
   1934 	# +0	saved $num, used in reduction section
   1935 	# +8	&t[2*$num], used in reduction section
   1936 	# +32	saved *n0
   1937 	# +40	saved %rsp
   1938 	# +48	t[2*$num]
   1939 	#
   1940 	mov	$n0,  32(%rsp)
   1941 	mov	%rax, 40(%rsp)		# save original %rsp
   1942 .Lfrom_body:
   1943 	mov	$num,%r11
   1944 	lea	48(%rsp),%rax
   1945 	pxor	%xmm0,%xmm0
   1946 	jmp	.Lmul_by_1
   1947 
   1948 .align	32
   1949 .Lmul_by_1:
   1950 	movdqu	($aptr),%xmm1
   1951 	movdqu	16($aptr),%xmm2
   1952 	movdqu	32($aptr),%xmm3
   1953 	movdqa	%xmm0,(%rax,$num)
   1954 	movdqu	48($aptr),%xmm4
   1955 	movdqa	%xmm0,16(%rax,$num)
   1956 	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea	64($aptr),$aptr
   1957 	movdqa	%xmm1,(%rax)
   1958 	movdqa	%xmm0,32(%rax,$num)
   1959 	movdqa	%xmm2,16(%rax)
   1960 	movdqa	%xmm0,48(%rax,$num)
   1961 	movdqa	%xmm3,32(%rax)
   1962 	movdqa	%xmm4,48(%rax)
   1963 	lea	64(%rax),%rax
   1964 	sub	\$64,%r11
   1965 	jnz	.Lmul_by_1
   1966 
   1967 	movq	$rptr,%xmm1
   1968 	movq	$nptr,%xmm2
   1969 	.byte	0x67
   1970 	mov	$nptr,%rbp
   1971 	movq	%r10, %xmm3		# -num
   1972 ___
   1973 $code.=<<___ if ($addx);
   1974 	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
   1975 	and	\$0x80100,%r11d
   1976 	cmp	\$0x80100,%r11d
   1977 	jne	.Lfrom_mont_nox
   1978 
   1979 	lea	(%rax,$num),$rptr
   1980 	call	sqrx8x_reduction
   1981 
   1982 	pxor	%xmm0,%xmm0
   1983 	lea	48(%rsp),%rax
   1984 	mov	40(%rsp),%rsi		# restore %rsp
   1985 	jmp	.Lfrom_mont_zero
   1986 
   1987 .align	32
   1988 .Lfrom_mont_nox:
   1989 ___
   1990 $code.=<<___;
   1991 	call	sqr8x_reduction
   1992 
   1993 	pxor	%xmm0,%xmm0
   1994 	lea	48(%rsp),%rax
   1995 	mov	40(%rsp),%rsi		# restore %rsp
   1996 	jmp	.Lfrom_mont_zero
   1997 
   1998 .align	32
   1999 .Lfrom_mont_zero:
   2000 	movdqa	%xmm0,16*0(%rax)
   2001 	movdqa	%xmm0,16*1(%rax)
   2002 	movdqa	%xmm0,16*2(%rax)
   2003 	movdqa	%xmm0,16*3(%rax)
   2004 	lea	16*4(%rax),%rax
   2005 	sub	\$32,$num
   2006 	jnz	.Lfrom_mont_zero
   2007 
   2008 	mov	\$1,%rax
   2009 	mov	-48(%rsi),%r15
   2010 	mov	-40(%rsi),%r14
   2011 	mov	-32(%rsi),%r13
   2012 	mov	-24(%rsi),%r12
   2013 	mov	-16(%rsi),%rbp
   2014 	mov	-8(%rsi),%rbx
   2015 	lea	(%rsi),%rsp
   2016 .Lfrom_epilogue:
   2017 	ret
   2018 .size	bn_from_mont8x,.-bn_from_mont8x
   2019 ___
   2020 }
   2021 }}}
   2022 
   2024 if ($addx) {{{
   2025 my $bp="%rdx";	# restore original value
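# This path relies on MULX (a flag-preserving 64x64->128 multiply) plus
# ADCX/ADOX, which update only CF resp. OF; two independent carry chains
# - one for the multiply row, one for the accumulation into t[] - can
# therefore be interleaved in a single loop body without flag spills.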
   2026 
   2027 $code.=<<___;
   2028 .type	bn_mulx4x_mont_gather5,\@function,6
   2029 .align	32
   2030 bn_mulx4x_mont_gather5:
   2031 .Lmulx4x_enter:
   2032 	.byte	0x67
   2033 	mov	%rsp,%rax
   2034 	push	%rbx
   2035 	push	%rbp
   2036 	push	%r12
   2037 	push	%r13
   2038 	push	%r14
   2039 	push	%r15
   2040 ___
   2041 $code.=<<___ if ($win64);
   2042 	lea	-0x28(%rsp),%rsp
   2043 	movaps	%xmm6,(%rsp)
   2044 	movaps	%xmm7,0x10(%rsp)
   2045 ___
   2046 $code.=<<___;
   2047 	.byte	0x67
   2048 	mov	${num}d,%r10d
   2049 	shl	\$3,${num}d		# convert $num to bytes
   2050 	shl	\$3+2,%r10d		# 4*$num
   2051 	neg	$num			# -$num
   2052 	mov	($n0),$n0		# *n0
   2053 
   2054 	##############################################################
   2055 	# ensure that stack frame doesn't alias with $aptr+4*$num
   2056 	# modulo 4096, which covers a[num], ret[num] and n[2*num]
   2057 	# (see bn_exp.c). this is done to allow memory disambiguation
    2058 	# logic to do its magic. [excessive frame is allocated in order
   2059 	# to allow bn_from_mont8x to clear it.]
   2060 	#
   2061 	lea	-64(%rsp,$num,2),%r11
   2062 	sub	$ap,%r11
   2063 	and	\$4095,%r11
   2064 	cmp	%r11,%r10
   2065 	jb	.Lmulx4xsp_alt
   2066 	sub	%r11,%rsp		# align with $aptr
   2067 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+$num)
   2068 	jmp	.Lmulx4xsp_done
   2069 
   2070 .align	32
   2071 .Lmulx4xsp_alt:
   2072 	lea	4096-64(,$num,2),%r10	# 4096-frame-$num
   2073 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+$num)
   2074 	sub	%r10,%r11
   2075 	mov	\$0,%r10
   2076 	cmovc	%r10,%r11
   2077 	sub	%r11,%rsp
    2078 .Lmulx4xsp_done:
   2079 	and	\$-64,%rsp		# ensure alignment
   2080 	##############################################################
   2081 	# Stack layout
   2082 	# +0	-num
   2083 	# +8	off-loaded &b[i]
   2084 	# +16	end of b[num]
   2085 	# +24	inner counter
   2086 	# +32	saved n0
   2087 	# +40	saved %rsp
   2088 	# +48
   2089 	# +56	saved rp
   2090 	# +64	tmp[num+1]
   2091 	#
   2092 	mov	$n0, 32(%rsp)		# save *n0
   2093 	mov	%rax,40(%rsp)		# save original %rsp
   2094 .Lmulx4x_body:
   2095 	call	mulx4x_internal
   2096 
   2097 	mov	40(%rsp),%rsi		# restore %rsp
   2098 	mov	\$1,%rax
   2099 ___
   2100 $code.=<<___ if ($win64);
   2101 	movaps	-88(%rsi),%xmm6
   2102 	movaps	-72(%rsi),%xmm7
   2103 ___
   2104 $code.=<<___;
   2105 	mov	-48(%rsi),%r15
   2106 	mov	-40(%rsi),%r14
   2107 	mov	-32(%rsi),%r13
   2108 	mov	-24(%rsi),%r12
   2109 	mov	-16(%rsi),%rbp
   2110 	mov	-8(%rsi),%rbx
   2111 	lea	(%rsi),%rsp
   2112 .Lmulx4x_epilogue:
   2113 	ret
   2114 .size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
   2115 
   2116 .type	mulx4x_internal,\@abi-omnipotent
   2117 .align	32
   2118 mulx4x_internal:
   2119 	.byte	0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00	# mov	$num,8(%rsp)		# save -$num
   2120 	.byte	0x67
   2121 	neg	$num			# restore $num
   2122 	shl	\$5,$num
   2123 	lea	256($bp,$num),%r13
   2124 	shr	\$5+5,$num
   2125 	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
   2126 	sub	\$1,$num
   2127 	mov	%r13,16+8(%rsp)		# end of b[num]
   2128 	mov	$num,24+8(%rsp)		# inner counter
   2129 	mov	$rp, 56+8(%rsp)		# save $rp
   2130 ___
   2131 my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   2132    ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
   2133 my $rptr=$bptr;
   2134 my $STRIDE=2**5*8;		# 5 is "window size"
   2135 my $N=$STRIDE/4;		# should match cache line size
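# Each of the 2^5 powers occupies one 64-bit word per 32-word row, so a
# row spans $STRIDE = 256 bytes = four 64-byte cache lines. The gather
# below loads the word at the same offset from each of the four lines
# and masks with %xmm4..%xmm7, exactly one of which .Lmagic_masks makes
# all-ones; the set of cache lines touched is thus independent of the
# (secret) index.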
   2136 $code.=<<___;
   2137 	mov	%r10,%r11
   2138 	shr	\$`log($N/8)/log(2)`,%r10
   2139 	and	\$`$N/8-1`,%r11
   2140 	not	%r10
   2141 	lea	.Lmagic_masks(%rip),%rax
   2142 	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
   2143 	lea	96($bp,%r11,8),$bptr	# pointer within 1st cache line
   2144 	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
   2145 	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
   2146 	add	\$7,%r11
   2147 	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
   2148 	movq	24(%rax,%r10,8),%xmm7
   2149 	and	\$7,%r11
   2150 
   2151 	movq	`0*$STRIDE/4-96`($bptr),%xmm0
   2152 	lea	$STRIDE($bptr),$tptr	# borrow $tptr
   2153 	movq	`1*$STRIDE/4-96`($bptr),%xmm1
   2154 	pand	%xmm4,%xmm0
   2155 	movq	`2*$STRIDE/4-96`($bptr),%xmm2
   2156 	pand	%xmm5,%xmm1
   2157 	movq	`3*$STRIDE/4-96`($bptr),%xmm3
   2158 	pand	%xmm6,%xmm2
   2159 	por	%xmm1,%xmm0
   2160 	movq	`0*$STRIDE/4-96`($tptr),%xmm1
   2161 	pand	%xmm7,%xmm3
   2162 	por	%xmm2,%xmm0
   2163 	movq	`1*$STRIDE/4-96`($tptr),%xmm2
   2164 	por	%xmm3,%xmm0
   2165 	.byte	0x67,0x67
   2166 	pand	%xmm4,%xmm1
   2167 	movq	`2*$STRIDE/4-96`($tptr),%xmm3
   2168 
   2169 	movq	%xmm0,%rdx		# bp[0]
   2170 	movq	`3*$STRIDE/4-96`($tptr),%xmm0
   2171 	lea	2*$STRIDE($bptr),$bptr	# next &b[i]
   2172 	pand	%xmm5,%xmm2
   2173 	.byte	0x67,0x67
   2174 	pand	%xmm6,%xmm3
   2175 	##############################################################
    2176 	# $tptr is chosen so that writing to the top-most element of the
    2177 	# vector occurs just "above" references to the powers table,
    2178 	# "above" modulo cache-line size, which effectively precludes the
    2179 	# possibility of a memory disambiguation logic failure when
    2180 	# accessing the table.
    2181 	#
   2182 	lea	64+8*4+8(%rsp,%r11,8),$tptr
   2183 
   2184 	mov	%rdx,$bi
   2185 	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
   2186 	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
   2187 	add	%rax,%r11
   2188 	mulx	2*8($aptr),%rax,%r13	# ...
   2189 	adc	%rax,%r12
   2190 	adc	\$0,%r13
   2191 	mulx	3*8($aptr),%rax,%r14
   2192 
   2193 	mov	$mi,%r15
   2194 	imulq	32+8(%rsp),$mi		# "t[0]"*n0
   2195 	xor	$zero,$zero		# cf=0, of=0
   2196 	mov	$mi,%rdx
   2197 
   2198 	por	%xmm2,%xmm1
   2199 	pand	%xmm7,%xmm0
   2200 	por	%xmm3,%xmm1
   2201 	mov	$bptr,8+8(%rsp)		# off-load &b[i]
   2202 	por	%xmm1,%xmm0
   2203 
   2204 	.byte	0x48,0x8d,0xb6,0x20,0x00,0x00,0x00	# lea	4*8($aptr),$aptr
   2205 	adcx	%rax,%r13
   2206 	adcx	$zero,%r14		# cf=0
   2207 
   2208 	mulx	0*16($nptr),%rax,%r10
   2209 	adcx	%rax,%r15		# discarded
   2210 	adox	%r11,%r10
   2211 	mulx	1*16($nptr),%rax,%r11
   2212 	adcx	%rax,%r10
   2213 	adox	%r12,%r11
   2214 	mulx	2*16($nptr),%rax,%r12
   2215 	mov	24+8(%rsp),$bptr	# counter value
   2216 	.byte	0x66
   2217 	mov	%r10,-8*4($tptr)
   2218 	adcx	%rax,%r11
   2219 	adox	%r13,%r12
   2220 	mulx	3*16($nptr),%rax,%r15
   2221 	 .byte	0x67,0x67
   2222 	 mov	$bi,%rdx
   2223 	mov	%r11,-8*3($tptr)
   2224 	adcx	%rax,%r12
   2225 	adox	$zero,%r15		# of=0
   2226 	.byte	0x48,0x8d,0x89,0x40,0x00,0x00,0x00	# lea	4*16($nptr),$nptr
   2227 	mov	%r12,-8*2($tptr)
   2228 	#jmp	.Lmulx4x_1st
   2229 
   2230 .align	32
   2231 .Lmulx4x_1st:
   2232 	adcx	$zero,%r15		# cf=0, modulo-scheduled
   2233 	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
   2234 	adcx	%r14,%r10
   2235 	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
   2236 	adcx	%rax,%r11
   2237 	mulx	2*8($aptr),%r12,%rax	# ...
   2238 	adcx	%r14,%r12
   2239 	mulx	3*8($aptr),%r13,%r14
   2240 	 .byte	0x67,0x67
   2241 	 mov	$mi,%rdx
   2242 	adcx	%rax,%r13
   2243 	adcx	$zero,%r14		# cf=0
   2244 	lea	4*8($aptr),$aptr
   2245 	lea	4*8($tptr),$tptr
   2246 
   2247 	adox	%r15,%r10
   2248 	mulx	0*16($nptr),%rax,%r15
   2249 	adcx	%rax,%r10
   2250 	adox	%r15,%r11
   2251 	mulx	1*16($nptr),%rax,%r15
   2252 	adcx	%rax,%r11
   2253 	adox	%r15,%r12
   2254 	mulx	2*16($nptr),%rax,%r15
   2255 	mov	%r10,-5*8($tptr)
   2256 	adcx	%rax,%r12
   2257 	mov	%r11,-4*8($tptr)
   2258 	adox	%r15,%r13
   2259 	mulx	3*16($nptr),%rax,%r15
   2260 	 mov	$bi,%rdx
   2261 	mov	%r12,-3*8($tptr)
   2262 	adcx	%rax,%r13
   2263 	adox	$zero,%r15
   2264 	lea	4*16($nptr),$nptr
   2265 	mov	%r13,-2*8($tptr)
   2266 
   2267 	dec	$bptr			# of=0, pass cf
   2268 	jnz	.Lmulx4x_1st
   2269 
   2270 	mov	8(%rsp),$num		# load -num
   2271 	movq	%xmm0,%rdx		# bp[1]
   2272 	adc	$zero,%r15		# modulo-scheduled
   2273 	lea	($aptr,$num),$aptr	# rewind $aptr
   2274 	add	%r15,%r14
   2275 	mov	8+8(%rsp),$bptr		# re-load &b[i]
   2276 	adc	$zero,$zero		# top-most carry
   2277 	mov	%r14,-1*8($tptr)
   2278 	jmp	.Lmulx4x_outer
   2279 
   2280 .align	32
   2281 .Lmulx4x_outer:
   2282 	mov	$zero,($tptr)		# save top-most carry
   2283 	lea	4*8($tptr,$num),$tptr	# rewind $tptr
   2284 	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
   2285 	xor	$zero,$zero		# cf=0, of=0
   2286 	mov	%rdx,$bi
   2287 	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
   2288 	adox	-4*8($tptr),$mi		# +t[0]
   2289 	adcx	%r14,%r11
   2290 	mulx	2*8($aptr),%r15,%r13	# ...
   2291 	adox	-3*8($tptr),%r11
   2292 	adcx	%r15,%r12
   2293 	mulx	3*8($aptr),%rdx,%r14
   2294 	adox	-2*8($tptr),%r12
   2295 	adcx	%rdx,%r13
   2296 	lea	($nptr,$num,2),$nptr	# rewind $nptr
   2297 	lea	4*8($aptr),$aptr
   2298 	adox	-1*8($tptr),%r13
   2299 	adcx	$zero,%r14
   2300 	adox	$zero,%r14
   2301 
   2302 	.byte	0x67
   2303 	mov	$mi,%r15
   2304 	imulq	32+8(%rsp),$mi		# "t[0]"*n0
   2305 
   2306 	movq	`0*$STRIDE/4-96`($bptr),%xmm0
   2307 	.byte	0x67,0x67
   2308 	mov	$mi,%rdx
   2309 	movq	`1*$STRIDE/4-96`($bptr),%xmm1
   2310 	.byte	0x67
   2311 	pand	%xmm4,%xmm0
   2312 	movq	`2*$STRIDE/4-96`($bptr),%xmm2
   2313 	.byte	0x67
   2314 	pand	%xmm5,%xmm1
   2315 	movq	`3*$STRIDE/4-96`($bptr),%xmm3
   2316 	add	\$$STRIDE,$bptr		# next &b[i]
   2317 	.byte	0x67
   2318 	pand	%xmm6,%xmm2
   2319 	por	%xmm1,%xmm0
   2320 	pand	%xmm7,%xmm3
   2321 	xor	$zero,$zero		# cf=0, of=0
   2322 	mov	$bptr,8+8(%rsp)		# off-load &b[i]
   2323 
   2324 	mulx	0*16($nptr),%rax,%r10
   2325 	adcx	%rax,%r15		# discarded
   2326 	adox	%r11,%r10
   2327 	mulx	1*16($nptr),%rax,%r11
   2328 	adcx	%rax,%r10
   2329 	adox	%r12,%r11
   2330 	mulx	2*16($nptr),%rax,%r12
   2331 	adcx	%rax,%r11
   2332 	adox	%r13,%r12
   2333 	mulx	3*16($nptr),%rax,%r15
   2334 	 mov	$bi,%rdx
   2335 	 por	%xmm2,%xmm0
   2336 	mov	24+8(%rsp),$bptr	# counter value
   2337 	mov	%r10,-8*4($tptr)
   2338 	 por	%xmm3,%xmm0
   2339 	adcx	%rax,%r12
   2340 	mov	%r11,-8*3($tptr)
   2341 	adox	$zero,%r15		# of=0
   2342 	mov	%r12,-8*2($tptr)
   2343 	lea	4*16($nptr),$nptr
   2344 	jmp	.Lmulx4x_inner
   2345 
   2346 .align	32
   2347 .Lmulx4x_inner:
   2348 	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
   2349 	adcx	$zero,%r15		# cf=0, modulo-scheduled
   2350 	adox	%r14,%r10
   2351 	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
   2352 	adcx	0*8($tptr),%r10
   2353 	adox	%rax,%r11
   2354 	mulx	2*8($aptr),%r12,%rax	# ...
   2355 	adcx	1*8($tptr),%r11
   2356 	adox	%r14,%r12
   2357 	mulx	3*8($aptr),%r13,%r14
   2358 	 mov	$mi,%rdx
   2359 	adcx	2*8($tptr),%r12
   2360 	adox	%rax,%r13
   2361 	adcx	3*8($tptr),%r13
   2362 	adox	$zero,%r14		# of=0
   2363 	lea	4*8($aptr),$aptr
   2364 	lea	4*8($tptr),$tptr
   2365 	adcx	$zero,%r14		# cf=0
   2366 
   2367 	adox	%r15,%r10
   2368 	mulx	0*16($nptr),%rax,%r15
   2369 	adcx	%rax,%r10
   2370 	adox	%r15,%r11
   2371 	mulx	1*16($nptr),%rax,%r15
   2372 	adcx	%rax,%r11
   2373 	adox	%r15,%r12
   2374 	mulx	2*16($nptr),%rax,%r15
   2375 	mov	%r10,-5*8($tptr)
   2376 	adcx	%rax,%r12
   2377 	adox	%r15,%r13
   2378 	mov	%r11,-4*8($tptr)
   2379 	mulx	3*16($nptr),%rax,%r15
   2380 	 mov	$bi,%rdx
   2381 	lea	4*16($nptr),$nptr
   2382 	mov	%r12,-3*8($tptr)
   2383 	adcx	%rax,%r13
   2384 	adox	$zero,%r15
   2385 	mov	%r13,-2*8($tptr)
   2386 
   2387 	dec	$bptr			# of=0, pass cf
   2388 	jnz	.Lmulx4x_inner
   2389 
   2390 	mov	0+8(%rsp),$num		# load -num
   2391 	movq	%xmm0,%rdx		# bp[i+1]
   2392 	adc	$zero,%r15		# modulo-scheduled
   2393 	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
   2394 	mov	8+8(%rsp),$bptr		# re-load &b[i]
   2395 	mov	16+8(%rsp),%r10
   2396 	adc	%r15,%r14
   2397 	lea	($aptr,$num),$aptr	# rewind $aptr
   2398 	adc	$zero,$zero		# top-most carry
   2399 	mov	%r14,-1*8($tptr)
   2400 
   2401 	cmp	%r10,$bptr
   2402 	jb	.Lmulx4x_outer
   2403 
   2404 	mov	-16($nptr),%r10
   2405 	xor	%r15,%r15
   2406 	sub	%r14,%r10		# compare top-most words
   2407 	adc	%r15,%r15
   2408 	or	%r15,$zero
   2409 	xor	\$1,$zero
   2410 	lea	($tptr,$num),%rdi	# rewind $tptr
   2411 	lea	($nptr,$num,2),$nptr	# rewind $nptr
   2412 	.byte	0x67,0x67
   2413 	sar	\$3+2,$num		# cf=0
   2414 	lea	($nptr,$zero,8),%rbp
   2415 	mov	56+8(%rsp),%rdx		# restore rp
   2416 	mov	$num,%rcx
   2417 	jmp	.Lsqrx4x_sub		# common post-condition
   2418 .size	mulx4x_internal,.-mulx4x_internal
   2419 ___
   2420 }{
   2422 ######################################################################
    2423 # void bn_powerx5(
   2424 my $rptr="%rdi";	# BN_ULONG *rptr,
   2425 my $aptr="%rsi";	# const BN_ULONG *aptr,
   2426 my $bptr="%rdx";	# const void *table,
   2427 my $nptr="%rcx";	# const BN_ULONG *nptr,
   2428 my $n0  ="%r8";		# const BN_ULONG *n0);
   2429 my $num ="%r9";		# int num, has to be divisible by 8
   2430 			# int pwr);
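# One window step of a fixed-window exponentiation, kept entirely in the
# Montgomery domain: five back-to-back squarings compute a^(2^5), which
# is then multiplied by the power gathered from the table at index pwr.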
   2431 
   2432 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
   2433 my @A0=("%r10","%r11");
   2434 my @A1=("%r12","%r13");
   2435 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
   2436 
   2437 $code.=<<___;
   2438 .type	bn_powerx5,\@function,6
   2439 .align	32
   2440 bn_powerx5:
   2441 .Lpowerx5_enter:
   2442 	.byte	0x67
   2443 	mov	%rsp,%rax
   2444 	push	%rbx
   2445 	push	%rbp
   2446 	push	%r12
   2447 	push	%r13
   2448 	push	%r14
   2449 	push	%r15
   2450 ___
   2451 $code.=<<___ if ($win64);
   2452 	lea	-0x28(%rsp),%rsp
   2453 	movaps	%xmm6,(%rsp)
   2454 	movaps	%xmm7,0x10(%rsp)
   2455 ___
   2456 $code.=<<___;
   2457 	.byte	0x67
   2458 	mov	${num}d,%r10d
   2459 	shl	\$3,${num}d		# convert $num to bytes
   2460 	shl	\$3+2,%r10d		# 4*$num
   2461 	neg	$num
   2462 	mov	($n0),$n0		# *n0
   2463 
   2464 	##############################################################
   2465 	# ensure that stack frame doesn't alias with $aptr+4*$num
   2466 	# modulo 4096, which covers ret[num], am[num] and n[2*num]
   2467 	# (see bn_exp.c). this is done to allow memory disambiguation
    2468 	# logic to do its magic.
   2469 	#
   2470 	lea	-64(%rsp,$num,2),%r11
   2471 	sub	$aptr,%r11
   2472 	and	\$4095,%r11
   2473 	cmp	%r11,%r10
   2474 	jb	.Lpwrx_sp_alt
   2475 	sub	%r11,%rsp		# align with $aptr
   2476 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
   2477 	jmp	.Lpwrx_sp_done
   2478 
   2479 .align	32
   2480 .Lpwrx_sp_alt:
   2481 	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
   2482 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
   2483 	sub	%r10,%r11
   2484 	mov	\$0,%r10
   2485 	cmovc	%r10,%r11
   2486 	sub	%r11,%rsp
   2487 .Lpwrx_sp_done:
   2488 	and	\$-64,%rsp
    2489 	mov	$num,%r10
   2490 	neg	$num
   2491 
   2492 	##############################################################
   2493 	# Stack layout
   2494 	#
   2495 	# +0	saved $num, used in reduction section
   2496 	# +8	&t[2*$num], used in reduction section
   2497 	# +16	intermediate carry bit
   2498 	# +24	top-most carry bit, used in reduction section
   2499 	# +32	saved *n0
   2500 	# +40	saved %rsp
   2501 	# +48	t[2*$num]
   2502 	#
   2503 	pxor	%xmm0,%xmm0
   2504 	movq	$rptr,%xmm1		# save $rptr
   2505 	movq	$nptr,%xmm2		# save $nptr
   2506 	movq	%r10, %xmm3		# -$num
   2507 	movq	$bptr,%xmm4
   2508 	mov	$n0,  32(%rsp)
   2509 	mov	%rax, 40(%rsp)		# save original %rsp
   2510 .Lpowerx5_body:
   2511 
   2512 	call	__bn_sqrx8x_internal
   2513 	call	__bn_sqrx8x_internal
   2514 	call	__bn_sqrx8x_internal
   2515 	call	__bn_sqrx8x_internal
   2516 	call	__bn_sqrx8x_internal
   2517 
   2518 	mov	%r10,$num		# -num
   2519 	mov	$aptr,$rptr
   2520 	movq	%xmm2,$nptr
   2521 	movq	%xmm4,$bptr
   2522 	mov	40(%rsp),%rax
   2523 
   2524 	call	mulx4x_internal
   2525 
   2526 	mov	40(%rsp),%rsi		# restore %rsp
   2527 	mov	\$1,%rax
   2528 ___
   2529 $code.=<<___ if ($win64);
   2530 	movaps	-88(%rsi),%xmm6
   2531 	movaps	-72(%rsi),%xmm7
   2532 ___
   2533 $code.=<<___;
   2534 	mov	-48(%rsi),%r15
   2535 	mov	-40(%rsi),%r14
   2536 	mov	-32(%rsi),%r13
   2537 	mov	-24(%rsi),%r12
   2538 	mov	-16(%rsi),%rbp
   2539 	mov	-8(%rsi),%rbx
   2540 	lea	(%rsi),%rsp
   2541 .Lpowerx5_epilogue:
   2542 	ret
   2543 .size	bn_powerx5,.-bn_powerx5
   2544 
   2545 .globl	bn_sqrx8x_internal
   2546 .hidden	bn_sqrx8x_internal
   2547 .type	bn_sqrx8x_internal,\@abi-omnipotent
   2548 .align	32
   2549 bn_sqrx8x_internal:
   2550 __bn_sqrx8x_internal:
   2551 	##################################################################
   2552 	# Squaring part:
   2553 	#
   2554 	# a) multiply-n-add everything but a[i]*a[i];
   2555 	# b) shift result of a) by 1 to the left and accumulate
   2556 	#    a[i]*a[i] products;
   2557 	#
   2558 	##################################################################
   2559 	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
   2560 	#                                                     a[1]a[0]
   2561 	#                                                 a[2]a[0]
   2562 	#                                             a[3]a[0]
   2563 	#                                             a[2]a[1]
   2564 	#                                         a[3]a[1]
   2565 	#                                     a[3]a[2]
   2566 	#
   2567 	#                                         a[4]a[0]
   2568 	#                                     a[5]a[0]
   2569 	#                                 a[6]a[0]
   2570 	#                             a[7]a[0]
   2571 	#                                     a[4]a[1]
   2572 	#                                 a[5]a[1]
   2573 	#                             a[6]a[1]
   2574 	#                         a[7]a[1]
   2575 	#                                 a[4]a[2]
   2576 	#                             a[5]a[2]
   2577 	#                         a[6]a[2]
   2578 	#                     a[7]a[2]
   2579 	#                             a[4]a[3]
   2580 	#                         a[5]a[3]
   2581 	#                     a[6]a[3]
   2582 	#                 a[7]a[3]
   2583 	#
   2584 	#                     a[5]a[4]
   2585 	#                 a[6]a[4]
   2586 	#             a[7]a[4]
   2587 	#             a[6]a[5]
   2588 	#         a[7]a[5]
   2589 	#     a[7]a[6]
   2590 	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
   2591 ___
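# In scalar terms, part b) doubles the off-diagonal sum and folds in the
# a[i]*a[i] squares column by column. A schematic C sketch (carries
# elided; the real code chains them through CF/OF in
# .Lsqrx4x_shift_n_add below):
#
#	for (i = 0; i < n; i++) {
#		(hi, lo)  = a[i] * a[i];	/* 128-bit square */
#		t[2*i]    = 2*t[2*i]   + lo;
#		t[2*i+1]  = 2*t[2*i+1] + hi;
#	}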
   2592 {
   2593 my ($zero,$carry)=("%rbp","%rcx");
   2594 my $aaptr=$zero;
   2595 $code.=<<___;
   2596 	lea	48+8(%rsp),$tptr
   2597 	lea	($aptr,$num),$aaptr
   2598 	mov	$num,0+8(%rsp)			# save $num
   2599 	mov	$aaptr,8+8(%rsp)		# save end of $aptr
   2600 	jmp	.Lsqr8x_zero_start
   2601 
   2602 .align	32
   2603 .byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
   2604 .Lsqrx8x_zero:
   2605 	.byte	0x3e
   2606 	movdqa	%xmm0,0*8($tptr)
   2607 	movdqa	%xmm0,2*8($tptr)
   2608 	movdqa	%xmm0,4*8($tptr)
   2609 	movdqa	%xmm0,6*8($tptr)
   2610 .Lsqr8x_zero_start:			# aligned at 32
   2611 	movdqa	%xmm0,8*8($tptr)
   2612 	movdqa	%xmm0,10*8($tptr)
   2613 	movdqa	%xmm0,12*8($tptr)
   2614 	movdqa	%xmm0,14*8($tptr)
   2615 	lea	16*8($tptr),$tptr
   2616 	sub	\$64,$num
   2617 	jnz	.Lsqrx8x_zero
   2618 
   2619 	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
   2620 	#xor	%r9,%r9			# t[1], ex-$num, zero already
   2621 	xor	%r10,%r10
   2622 	xor	%r11,%r11
   2623 	xor	%r12,%r12
   2624 	xor	%r13,%r13
   2625 	xor	%r14,%r14
   2626 	xor	%r15,%r15
   2627 	lea	48+8(%rsp),$tptr
    2628 	xor	$zero,$zero		# cf=0, of=0
   2629 	jmp	.Lsqrx8x_outer_loop
   2630 
   2631 .align	32
   2632 .Lsqrx8x_outer_loop:
   2633 	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
   2634 	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
   2635 	adox	%rax,%r10
   2636 	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
   2637 	adcx	%r10,%r9
   2638 	adox	%rax,%r11
   2639 	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
   2640 	adcx	%r11,%r10
   2641 	adox	%rax,%r12
   2642 	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
   2643 	adcx	%r12,%r11
   2644 	adox	%rax,%r13
   2645 	mulx	5*8($aptr),%r12,%rax
   2646 	adcx	%r13,%r12
   2647 	adox	%rax,%r14
   2648 	mulx	6*8($aptr),%r13,%rax
   2649 	adcx	%r14,%r13
   2650 	adox	%r15,%rax
   2651 	mulx	7*8($aptr),%r14,%r15
   2652 	 mov	1*8($aptr),%rdx		# a[1]
   2653 	adcx	%rax,%r14
   2654 	adox	$zero,%r15
   2655 	adc	8*8($tptr),%r15
   2656 	mov	%r8,1*8($tptr)		# t[1]
   2657 	mov	%r9,2*8($tptr)		# t[2]
   2658 	sbb	$carry,$carry		# mov %cf,$carry
   2659 	xor	$zero,$zero		# cf=0, of=0
   2660 
   2661 
   2662 	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
   2663 	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
   2664 	adcx	%r10,%r8
   2665 	adox	%rbx,%r9
   2666 	mulx	4*8($aptr),%r10,%rbx	# ...
   2667 	adcx	%r11,%r9
   2668 	adox	%rax,%r10
   2669 	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx	5*8($aptr),%r11,%rax
   2670 	adcx	%r12,%r10
   2671 	adox	%rbx,%r11
   2672 	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r12,%rbx
   2673 	adcx	%r13,%r11
   2674 	adox	%r14,%r12
   2675 	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r13,%r14
   2676 	 mov	2*8($aptr),%rdx		# a[2]
   2677 	adcx	%rax,%r12
   2678 	adox	%rbx,%r13
   2679 	adcx	%r15,%r13
   2680 	adox	$zero,%r14		# of=0
   2681 	adcx	$zero,%r14		# cf=0
   2682 
   2683 	mov	%r8,3*8($tptr)		# t[3]
   2684 	mov	%r9,4*8($tptr)		# t[4]
   2685 
   2686 	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
   2687 	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
   2688 	adcx	%r10,%r8
   2689 	adox	%rbx,%r9
   2690 	mulx	5*8($aptr),%r10,%rbx	# ...
   2691 	adcx	%r11,%r9
   2692 	adox	%rax,%r10
   2693 	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r11,%rax
   2694 	adcx	%r12,%r10
   2695 	adox	%r13,%r11
   2696 	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r12,%r13
   2697 	.byte	0x3e
   2698 	 mov	3*8($aptr),%rdx		# a[3]
   2699 	adcx	%rbx,%r11
   2700 	adox	%rax,%r12
   2701 	adcx	%r14,%r12
   2702 	mov	%r8,5*8($tptr)		# t[5]
   2703 	mov	%r9,6*8($tptr)		# t[6]
   2704 	 mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
   2705 	adox	$zero,%r13		# of=0
   2706 	adcx	$zero,%r13		# cf=0
   2707 
   2708 	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
   2709 	adcx	%r10,%r8
   2710 	adox	%rax,%r9
   2711 	mulx	6*8($aptr),%r10,%rax	# ...
   2712 	adcx	%r11,%r9
   2713 	adox	%r12,%r10
   2714 	mulx	7*8($aptr),%r11,%r12
   2715 	 mov	4*8($aptr),%rdx		# a[4]
   2716 	 mov	5*8($aptr),%r14		# a[5]
   2717 	adcx	%rbx,%r10
   2718 	adox	%rax,%r11
   2719 	 mov	6*8($aptr),%r15		# a[6]
   2720 	adcx	%r13,%r11
   2721 	adox	$zero,%r12		# of=0
   2722 	adcx	$zero,%r12		# cf=0
   2723 
   2724 	mov	%r8,7*8($tptr)		# t[7]
   2725 	mov	%r9,8*8($tptr)		# t[8]
   2726 
   2727 	mulx	%r14,%r9,%rax		# a[5]*a[4]
   2728 	 mov	7*8($aptr),%r8		# a[7]
   2729 	adcx	%r10,%r9
   2730 	mulx	%r15,%r10,%rbx		# a[6]*a[4]
   2731 	adox	%rax,%r10
   2732 	adcx	%r11,%r10
   2733 	mulx	%r8,%r11,%rax		# a[7]*a[4]
   2734 	 mov	%r14,%rdx		# a[5]
   2735 	adox	%rbx,%r11
   2736 	adcx	%r12,%r11
   2737 	#adox	$zero,%rax		# of=0
   2738 	adcx	$zero,%rax		# cf=0
   2739 
   2740 	mulx	%r15,%r14,%rbx		# a[6]*a[5]
   2741 	mulx	%r8,%r12,%r13		# a[7]*a[5]
   2742 	 mov	%r15,%rdx		# a[6]
   2743 	 lea	8*8($aptr),$aptr
   2744 	adcx	%r14,%r11
   2745 	adox	%rbx,%r12
   2746 	adcx	%rax,%r12
   2747 	adox	$zero,%r13
   2748 
   2749 	.byte	0x67,0x67
   2750 	mulx	%r8,%r8,%r14		# a[7]*a[6]
   2751 	adcx	%r8,%r13
   2752 	adcx	$zero,%r14
   2753 
   2754 	cmp	8+8(%rsp),$aptr
   2755 	je	.Lsqrx8x_outer_break
   2756 
   2757 	neg	$carry			# mov $carry,%cf
   2758 	mov	\$-8,%rcx
   2759 	mov	$zero,%r15
   2760 	mov	8*8($tptr),%r8
   2761 	adcx	9*8($tptr),%r9		# +=t[9]
   2762 	adcx	10*8($tptr),%r10	# ...
   2763 	adcx	11*8($tptr),%r11
   2764 	adc	12*8($tptr),%r12
   2765 	adc	13*8($tptr),%r13
   2766 	adc	14*8($tptr),%r14
   2767 	adc	15*8($tptr),%r15
   2768 	lea	($aptr),$aaptr
   2769 	lea	2*64($tptr),$tptr
   2770 	sbb	%rax,%rax		# mov %cf,$carry
   2771 
   2772 	mov	-64($aptr),%rdx		# a[0]
   2773 	mov	%rax,16+8(%rsp)		# offload $carry
   2774 	mov	$tptr,24+8(%rsp)
   2775 
   2776 	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
   2777 	xor	%eax,%eax		# cf=0, of=0
   2778 	jmp	.Lsqrx8x_loop
   2779 
   2780 .align	32
   2781 .Lsqrx8x_loop:
   2782 	mov	%r8,%rbx
   2783 	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
   2784 	adcx	%rax,%rbx		# +=t[8]
   2785 	adox	%r9,%r8
   2786 
   2787 	mulx	1*8($aaptr),%rax,%r9	# ...
   2788 	adcx	%rax,%r8
   2789 	adox	%r10,%r9
   2790 
   2791 	mulx	2*8($aaptr),%rax,%r10
   2792 	adcx	%rax,%r9
   2793 	adox	%r11,%r10
   2794 
   2795 	mulx	3*8($aaptr),%rax,%r11
   2796 	adcx	%rax,%r10
   2797 	adox	%r12,%r11
   2798 
   2799 	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	4*8($aaptr),%rax,%r12
   2800 	adcx	%rax,%r11
   2801 	adox	%r13,%r12
   2802 
   2803 	mulx	5*8($aaptr),%rax,%r13
   2804 	adcx	%rax,%r12
   2805 	adox	%r14,%r13
   2806 
   2807 	mulx	6*8($aaptr),%rax,%r14
   2808 	 mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
   2809 	 mov	\$0,%ebx
   2810 	adcx	%rax,%r13
   2811 	adox	%r15,%r14
   2812 
   2813 	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx	7*8($aaptr),%rax,%r15
   2814 	 mov	8($aptr,%rcx,8),%rdx	# a[i]
   2815 	adcx	%rax,%r14
   2816 	adox	%rbx,%r15		# %rbx is 0, of=0
   2817 	adcx	%rbx,%r15		# cf=0
   2818 
   2819 	.byte	0x67
   2820 	inc	%rcx			# of=0
   2821 	jnz	.Lsqrx8x_loop
   2822 
   2823 	lea	8*8($aaptr),$aaptr
   2824 	mov	\$-8,%rcx
   2825 	cmp	8+8(%rsp),$aaptr	# done?
   2826 	je	.Lsqrx8x_break
   2827 
   2828 	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
   2829 	.byte	0x66
   2830 	mov	-64($aptr),%rdx
   2831 	adcx	0*8($tptr),%r8
   2832 	adcx	1*8($tptr),%r9
   2833 	adc	2*8($tptr),%r10
   2834 	adc	3*8($tptr),%r11
   2835 	adc	4*8($tptr),%r12
   2836 	adc	5*8($tptr),%r13
   2837 	adc	6*8($tptr),%r14
   2838 	adc	7*8($tptr),%r15
   2839 	lea	8*8($tptr),$tptr
   2840 	.byte	0x67
   2841 	sbb	%rax,%rax		# mov %cf,%rax
   2842 	xor	%ebx,%ebx		# cf=0, of=0
   2843 	mov	%rax,16+8(%rsp)		# offload carry
   2844 	jmp	.Lsqrx8x_loop
   2845 
   2846 .align	32
   2847 .Lsqrx8x_break:
   2848 	sub	16+8(%rsp),%r8		# consume last carry
   2849 	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
   2850 	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
   2851 	xor	%ebp,%ebp		# xor	$zero,$zero
   2852 	mov	%r8,0*8($tptr)
   2853 	cmp	$carry,$tptr		# cf=0, of=0
   2854 	je	.Lsqrx8x_outer_loop
   2855 
   2856 	mov	%r9,1*8($tptr)
   2857 	 mov	1*8($carry),%r9
   2858 	mov	%r10,2*8($tptr)
   2859 	 mov	2*8($carry),%r10
   2860 	mov	%r11,3*8($tptr)
   2861 	 mov	3*8($carry),%r11
   2862 	mov	%r12,4*8($tptr)
   2863 	 mov	4*8($carry),%r12
   2864 	mov	%r13,5*8($tptr)
   2865 	 mov	5*8($carry),%r13
   2866 	mov	%r14,6*8($tptr)
   2867 	 mov	6*8($carry),%r14
   2868 	mov	%r15,7*8($tptr)
   2869 	 mov	7*8($carry),%r15
   2870 	mov	$carry,$tptr
   2871 	jmp	.Lsqrx8x_outer_loop
   2872 
   2873 .align	32
   2874 .Lsqrx8x_outer_break:
   2875 	mov	%r9,9*8($tptr)		# t[9]
   2876 	 movq	%xmm3,%rcx		# -$num
   2877 	mov	%r10,10*8($tptr)	# ...
   2878 	mov	%r11,11*8($tptr)
   2879 	mov	%r12,12*8($tptr)
   2880 	mov	%r13,13*8($tptr)
   2881 	mov	%r14,14*8($tptr)
   2882 ___
   2883 }{
   2885 my $i="%rcx";
   2886 $code.=<<___;
   2887 	lea	48+8(%rsp),$tptr
   2888 	mov	($aptr,$i),%rdx		# a[0]
   2889 
   2890 	mov	8($tptr),$A0[1]		# t[1]
   2891 	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
   2892 	mov	0+8(%rsp),$num		# restore $num
   2893 	adox	$A0[1],$A0[1]
   2894 	 mov	16($tptr),$A1[0]	# t[2]	# prefetch
   2895 	 mov	24($tptr),$A1[1]	# t[3]	# prefetch
   2896 	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned
   2897 
   2898 .align	32
   2899 .Lsqrx4x_shift_n_add:
   2900 	mulx	%rdx,%rax,%rbx
   2901 	 adox	$A1[0],$A1[0]
   2902 	adcx	$A0[0],%rax
   2903 	 .byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
   2904 	 .byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
   2905 	 adox	$A1[1],$A1[1]
   2906 	adcx	$A0[1],%rbx
   2907 	 mov	40($tptr),$A0[1]		# t[2*i+4+1]	# prefetch
   2908 	mov	%rax,0($tptr)
   2909 	mov	%rbx,8($tptr)
   2910 
   2911 	mulx	%rdx,%rax,%rbx
   2912 	 adox	$A0[0],$A0[0]
   2913 	adcx	$A1[0],%rax
   2914 	 mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
   2915 	 mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
   2916 	 adox	$A0[1],$A0[1]
   2917 	adcx	$A1[1],%rbx
   2918 	 mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
   2919 	mov	%rax,16($tptr)
   2920 	mov	%rbx,24($tptr)
   2921 
   2922 	mulx	%rdx,%rax,%rbx
   2923 	 adox	$A1[0],$A1[0]
   2924 	adcx	$A0[0],%rax
   2925 	 mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
   2926 	 lea	32($i),$i
   2927 	 mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
   2928 	 adox	$A1[1],$A1[1]
   2929 	adcx	$A0[1],%rbx
   2930 	 mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
   2931 	mov	%rax,32($tptr)
   2932 	mov	%rbx,40($tptr)
   2933 
   2934 	mulx	%rdx,%rax,%rbx
   2935 	 adox	$A0[0],$A0[0]
   2936 	adcx	$A1[0],%rax
   2937 	jrcxz	.Lsqrx4x_shift_n_add_break
   2938 	 .byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
   2939 	 adox	$A0[1],$A0[1]
   2940 	adcx	$A1[1],%rbx
   2941 	 mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
   2942 	 mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
   2943 	mov	%rax,48($tptr)
   2944 	mov	%rbx,56($tptr)
   2945 	lea	64($tptr),$tptr
   2946 	nop
   2947 	jmp	.Lsqrx4x_shift_n_add
   2948 
   2949 .align	32
   2950 .Lsqrx4x_shift_n_add_break:
   2951 	adcx	$A1[1],%rbx
   2952 	mov	%rax,48($tptr)
   2953 	mov	%rbx,56($tptr)
   2954 	lea	64($tptr),$tptr		# end of t[] buffer
   2955 ___
   2956 }
   2958 ######################################################################
   2959 # Montgomery reduction part, "word-by-word" algorithm.
   2960 #
   2961 # This new path is inspired by multiple submissions from Intel, by
   2962 # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
   2963 # Vinodh Gopal...
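# A C sketch of the word-by-word reduction (illustrative only; n0 is
# -n^-1 mod 2^64, t[] holds the 2*num-word input, and the asm strides
# 16 bytes through n[] because the modulus is interleaved with zeros):
#
#	for (i = 0; i < num; i++) {
#		m = t[i] * n0;				/* mod 2^64 */
#		c = 0;
#		for (j = 0; j < num; j++) {
#			p = (unsigned __int128)m * n[j] + t[i+j] + c;
#			t[i+j] = (BN_ULONG)p;		/* t[i] becomes 0 */
#			c = (BN_ULONG)(p >> 64);
#		}
#		t[i+num] += c;	/* carry-out is the "top-most carry bit" */
#	}
#	/* result in t[num..2*num-1], one conditional subtraction left */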
   2964 {
   2965 my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
   2966 
   2967 $code.=<<___;
   2968 	movq	%xmm2,$nptr
   2969 sqrx8x_reduction:
   2970 	xor	%eax,%eax		# initial top-most carry bit
   2971 	mov	32+8(%rsp),%rbx		# n0
   2972 	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
   2973 	lea	-128($nptr,$num,2),%rcx	# end of n[]
   2974 	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
   2975 	mov	%rcx, 0+8(%rsp)		# save end of n[]
   2976 	mov	$tptr,8+8(%rsp)		# save end of t[]
   2977 
   2978 	lea	48+8(%rsp),$tptr		# initial t[] window
   2979 	jmp	.Lsqrx8x_reduction_loop
   2980 
   2981 .align	32
   2982 .Lsqrx8x_reduction_loop:
   2983 	mov	8*1($tptr),%r9
   2984 	mov	8*2($tptr),%r10
   2985 	mov	8*3($tptr),%r11
   2986 	mov	8*4($tptr),%r12
   2987 	mov	%rdx,%r8
   2988 	imulq	%rbx,%rdx		# n0*a[i]
   2989 	mov	8*5($tptr),%r13
   2990 	mov	8*6($tptr),%r14
   2991 	mov	8*7($tptr),%r15
   2992 	mov	%rax,24+8(%rsp)		# store top-most carry bit
   2993 
   2994 	lea	8*8($tptr),$tptr
   2995 	xor	$carry,$carry		# cf=0,of=0
   2996 	mov	\$-8,%rcx
   2997 	jmp	.Lsqrx8x_reduce
   2998 
   2999 .align	32
   3000 .Lsqrx8x_reduce:
   3001 	mov	%r8, %rbx
   3002 	mulx	16*0($nptr),%rax,%r8	# n[0]
   3003 	adcx	%rbx,%rax		# discarded
   3004 	adox	%r9,%r8
   3005 
   3006 	mulx	16*1($nptr),%rbx,%r9	# n[1]
   3007 	adcx	%rbx,%r8
   3008 	adox	%r10,%r9
   3009 
   3010 	mulx	16*2($nptr),%rbx,%r10
   3011 	adcx	%rbx,%r9
   3012 	adox	%r11,%r10
   3013 
   3014 	mulx	16*3($nptr),%rbx,%r11
   3015 	adcx	%rbx,%r10
   3016 	adox	%r12,%r11
   3017 
   3018 	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx	16*4($nptr),%rbx,%r12
   3019 	 mov	%rdx,%rax
   3020 	 mov	%r8,%rdx
   3021 	adcx	%rbx,%r11
   3022 	adox	%r13,%r12
   3023 
   3024 	 mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
   3025 	 mov	%rax,%rdx
   3026 	 mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]
   3027 
   3028 	mulx	16*5($nptr),%rax,%r13
   3029 	adcx	%rax,%r12
   3030 	adox	%r14,%r13
   3031 
   3032 	mulx	16*6($nptr),%rax,%r14
   3033 	adcx	%rax,%r13
   3034 	adox	%r15,%r14
   3035 
   3036 	mulx	16*7($nptr),%rax,%r15
   3037 	 mov	%rbx,%rdx
   3038 	adcx	%rax,%r14
   3039 	adox	$carry,%r15		# $carry is 0
   3040 	adcx	$carry,%r15		# cf=0
   3041 
   3042 	.byte	0x67,0x67,0x67
   3043 	inc	%rcx			# of=0
   3044 	jnz	.Lsqrx8x_reduce
   3045 
   3046 	mov	$carry,%rax		# xor	%rax,%rax
   3047 	cmp	0+8(%rsp),$nptr		# end of n[]?
   3048 	jae	.Lsqrx8x_no_tail
   3049 
   3050 	mov	48+8(%rsp),%rdx		# pull n0*a[0]
   3051 	add	8*0($tptr),%r8
   3052 	lea	16*8($nptr),$nptr
   3053 	mov	\$-8,%rcx
   3054 	adcx	8*1($tptr),%r9
   3055 	adcx	8*2($tptr),%r10
   3056 	adc	8*3($tptr),%r11
   3057 	adc	8*4($tptr),%r12
   3058 	adc	8*5($tptr),%r13
   3059 	adc	8*6($tptr),%r14
   3060 	adc	8*7($tptr),%r15
   3061 	lea	8*8($tptr),$tptr
   3062 	sbb	%rax,%rax		# top carry
   3063 
   3064 	xor	$carry,$carry		# of=0, cf=0
   3065 	mov	%rax,16+8(%rsp)
   3066 	jmp	.Lsqrx8x_tail
   3067 
   3068 .align	32
   3069 .Lsqrx8x_tail:
   3070 	mov	%r8,%rbx
   3071 	mulx	16*0($nptr),%rax,%r8
   3072 	adcx	%rax,%rbx
   3073 	adox	%r9,%r8
   3074 
   3075 	mulx	16*1($nptr),%rax,%r9
   3076 	adcx	%rax,%r8
   3077 	adox	%r10,%r9
   3078 
   3079 	mulx	16*2($nptr),%rax,%r10
   3080 	adcx	%rax,%r9
   3081 	adox	%r11,%r10
   3082 
   3083 	mulx	16*3($nptr),%rax,%r11
   3084 	adcx	%rax,%r10
   3085 	adox	%r12,%r11
   3086 
   3087 	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx	16*4($nptr),%rax,%r12
   3088 	adcx	%rax,%r11
   3089 	adox	%r13,%r12
   3090 
   3091 	mulx	16*5($nptr),%rax,%r13
   3092 	adcx	%rax,%r12
   3093 	adox	%r14,%r13
   3094 
   3095 	mulx	16*6($nptr),%rax,%r14
   3096 	adcx	%rax,%r13
   3097 	adox	%r15,%r14
   3098 
   3099 	mulx	16*7($nptr),%rax,%r15
   3100 	 mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
   3101 	adcx	%rax,%r14
   3102 	adox	$carry,%r15
   3103 	 mov	%rbx,($tptr,%rcx,8)	# save result
   3104 	 mov	%r8,%rbx
   3105 	adcx	$carry,%r15		# cf=0
   3106 
   3107 	inc	%rcx			# of=0
   3108 	jnz	.Lsqrx8x_tail
   3109 
   3110 	cmp	0+8(%rsp),$nptr		# end of n[]?
   3111 	jae	.Lsqrx8x_tail_done	# break out of loop
   3112 
   3113 	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
   3114 	 mov	48+8(%rsp),%rdx		# pull n0*a[0]
   3115 	 lea	16*8($nptr),$nptr
   3116 	adc	8*0($tptr),%r8
   3117 	adc	8*1($tptr),%r9
   3118 	adc	8*2($tptr),%r10
   3119 	adc	8*3($tptr),%r11
   3120 	adc	8*4($tptr),%r12
   3121 	adc	8*5($tptr),%r13
   3122 	adc	8*6($tptr),%r14
   3123 	adc	8*7($tptr),%r15
   3124 	lea	8*8($tptr),$tptr
   3125 	sbb	%rax,%rax
   3126 	sub	\$8,%rcx		# mov	\$-8,%rcx
   3127 
   3128 	xor	$carry,$carry		# of=0, cf=0
   3129 	mov	%rax,16+8(%rsp)
   3130 	jmp	.Lsqrx8x_tail
   3131 
   3132 .align	32
   3133 .Lsqrx8x_tail_done:
   3134 	add	24+8(%rsp),%r8		# can this overflow?
   3135 	adc	\$0,%r9
   3136 	adc	\$0,%r10
   3137 	adc	\$0,%r11
   3138 	adc	\$0,%r12
   3139 	adc	\$0,%r13
   3140 	adc	\$0,%r14
   3141 	adc	\$0,%r15		# can't overflow, because we
   3142 					# started with "overhung" part
   3143 					# of multiplication
   3144 	mov	$carry,%rax		# xor	%rax,%rax
   3145 
   3146 	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
   3147 .Lsqrx8x_no_tail:			# %cf is 0 if jumped here
   3148 	adc	8*0($tptr),%r8
   3149 	 movq	%xmm3,%rcx
   3150 	adc	8*1($tptr),%r9
   3151 	 mov	16*7($nptr),$carry
   3152 	 movq	%xmm2,$nptr		# restore $nptr
   3153 	adc	8*2($tptr),%r10
   3154 	adc	8*3($tptr),%r11
   3155 	adc	8*4($tptr),%r12
   3156 	adc	8*5($tptr),%r13
   3157 	adc	8*6($tptr),%r14
   3158 	adc	8*7($tptr),%r15
   3159 	adc	%rax,%rax		# top-most carry
   3160 
   3161 	mov	32+8(%rsp),%rbx		# n0
   3162 	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"
   3163 
   3164 	mov	%r8,8*0($tptr)		# store top 512 bits
   3165 	 lea	8*8($tptr),%r8		# borrow %r8
   3166 	mov	%r9,8*1($tptr)
   3167 	mov	%r10,8*2($tptr)
   3168 	mov	%r11,8*3($tptr)
   3169 	mov	%r12,8*4($tptr)
   3170 	mov	%r13,8*5($tptr)
   3171 	mov	%r14,8*6($tptr)
   3172 	mov	%r15,8*7($tptr)
   3173 
   3174 	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
   3175 	cmp	8+8(%rsp),%r8		# end of t[]?
   3176 	jb	.Lsqrx8x_reduction_loop
   3177 ___
   3178 }
   3180 ##############################################################
   3181 # Post-condition, 4x unrolled
   3182 #
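# Same constant-time tail as in the non-MULX path above: $nptr is biased
# by 8 bytes to select between the modulus words and the interleaved
# zeros, so the subtraction loop below runs identically whether or not
# the final subtraction is actually needed.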
   3183 {
   3184 my ($rptr,$nptr)=("%rdx","%rbp");
   3185 my @ri=map("%r$_",(10..13));
   3186 my @ni=map("%r$_",(14..15));
   3187 $code.=<<___;
   3188 	xor	%ebx,%ebx
   3189 	sub	%r15,%rsi		# compare top-most words
   3190 	adc	%rbx,%rbx
   3191 	mov	%rcx,%r10		# -$num
   3192 	or	%rbx,%rax
   3193 	mov	%rcx,%r9		# -$num
   3194 	xor	\$1,%rax
   3195 	sar	\$3+2,%rcx		# cf=0
   3196 	#lea	48+8(%rsp,%r9),$tptr
   3197 	lea	($nptr,%rax,8),$nptr
   3198 	movq	%xmm1,$rptr		# restore $rptr
   3199 	movq	%xmm1,$aptr		# prepare for back-to-back call
   3200 	jmp	.Lsqrx4x_sub
   3201 
   3202 .align	32
   3203 .Lsqrx4x_sub:
   3204 	.byte	0x66
   3205 	mov	8*0($tptr),%r12
   3206 	mov	8*1($tptr),%r13
   3207 	sbb	16*0($nptr),%r12
   3208 	mov	8*2($tptr),%r14
   3209 	sbb	16*1($nptr),%r13
   3210 	mov	8*3($tptr),%r15
   3211 	lea	8*4($tptr),$tptr
   3212 	sbb	16*2($nptr),%r14
   3213 	mov	%r12,8*0($rptr)
   3214 	sbb	16*3($nptr),%r15
   3215 	lea	16*4($nptr),$nptr
   3216 	mov	%r13,8*1($rptr)
   3217 	mov	%r14,8*2($rptr)
   3218 	mov	%r15,8*3($rptr)
   3219 	lea	8*4($rptr),$rptr
   3220 
   3221 	inc	%rcx
   3222 	jnz	.Lsqrx4x_sub
   3223 ___
   3224 }
   3225 $code.=<<___;
   3226 	neg	%r9			# restore $num
   3227 
   3228 	ret
   3229 .size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
   3230 ___
   3231 }}}
   3232 {
   3233 my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
   3234 				("%rdi","%esi","%rdx","%ecx");  # Unix order
   3235 my $out=$inp;
   3236 my $STRIDE=2**5*8;
   3237 my $N=$STRIDE/4;
   3238 
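# bn_scatter5 stores word j of a power at tbl[32*j+idx], so all 2^5
# powers interleave and every cache line of the table is shared by all
# of them. A C sketch of a functionally equivalent scatter/gather pair
# (illustrative only -- the assembly gathers with SSE2 masks rather
# than a compare per slot):
#
#	void scatter5(BN_ULONG *tbl, const BN_ULONG *inp, int num, int idx)
#	{	for (int j = 0; j < num; j++) tbl[32*j + idx] = inp[j]; }
#
#	void gather5(BN_ULONG *out, const BN_ULONG *tbl, int num, int idx)
#	{	for (int j = 0; j < num; j++) {
#			BN_ULONG acc = 0;
#			for (int i = 0; i < 32; i++)	/* scan all 32 slots */
#				acc |= tbl[32*j + i] & (0 - (BN_ULONG)(i == idx));
#			out[j] = acc;
#		}
#	}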
   3239 $code.=<<___;
   3240 .globl	bn_scatter5
   3241 .type	bn_scatter5,\@abi-omnipotent
   3242 .align	16
   3243 bn_scatter5:
   3244 	cmp	\$0, $num
   3245 	jz	.Lscatter_epilogue
   3246 	lea	($tbl,$idx,8),$tbl
   3247 .Lscatter:
   3248 	mov	($inp),%rax
   3249 	lea	8($inp),$inp
   3250 	mov	%rax,($tbl)
   3251 	lea	32*8($tbl),$tbl
   3252 	sub	\$1,$num
   3253 	jnz	.Lscatter
   3254 .Lscatter_epilogue:
   3255 	ret
   3256 .size	bn_scatter5,.-bn_scatter5
   3257 
   3258 .globl	bn_gather5
   3259 .type	bn_gather5,\@abi-omnipotent
   3260 .align	16
   3261 bn_gather5:
   3262 ___
   3263 $code.=<<___ if ($win64);
   3264 .LSEH_begin_bn_gather5:
   3265 	# I can't trust assembler to use specific encoding:-(
   3266 	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
   3267 	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
   3268 	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
   3269 ___
   3270 $code.=<<___;
   3271 	mov	$idx,%r11d
   3272 	shr	\$`log($N/8)/log(2)`,$idx
   3273 	and	\$`$N/8-1`,%r11
   3274 	not	$idx
   3275 	lea	.Lmagic_masks(%rip),%rax
   3276 	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
   3277 	lea	128($tbl,%r11,8),$tbl	# pointer within 1st cache line
   3278 	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
   3279 	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
    3280 	movq	16(%rax,$idx,8),%xmm6	# denoted by the idx argument
   3281 	movq	24(%rax,$idx,8),%xmm7
   3282 	jmp	.Lgather
   3283 .align	16
   3284 .Lgather:
   3285 	movq	`0*$STRIDE/4-128`($tbl),%xmm0
   3286 	movq	`1*$STRIDE/4-128`($tbl),%xmm1
   3287 	pand	%xmm4,%xmm0
   3288 	movq	`2*$STRIDE/4-128`($tbl),%xmm2
   3289 	pand	%xmm5,%xmm1
   3290 	movq	`3*$STRIDE/4-128`($tbl),%xmm3
   3291 	pand	%xmm6,%xmm2
   3292 	por	%xmm1,%xmm0
   3293 	pand	%xmm7,%xmm3
   3294 	.byte	0x67,0x67
   3295 	por	%xmm2,%xmm0
   3296 	lea	$STRIDE($tbl),$tbl
   3297 	por	%xmm3,%xmm0
   3298 
   3299 	movq	%xmm0,($out)		# m0=bp[0]
   3300 	lea	8($out),$out
   3301 	sub	\$1,$num
   3302 	jnz	.Lgather
   3303 ___
   3304 $code.=<<___ if ($win64);
   3305 	movaps	(%rsp),%xmm6
   3306 	movaps	0x10(%rsp),%xmm7
   3307 	lea	0x28(%rsp),%rsp
   3308 ___
   3309 $code.=<<___;
   3310 	ret
   3311 .LSEH_end_bn_gather5:
   3312 .size	bn_gather5,.-bn_gather5
   3313 ___
   3314 }
   3315 $code.=<<___;
   3316 .align	64
   3317 .Lmagic_masks:
   3318 	.long	0,0, 0,0, 0,0, -1,-1
   3319 	.long	0,0, 0,0, 0,0,  0,0
   3320 .asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   3321 ___
   3322 
   3323 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   3324 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
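# mul_handler distinguishes three cases by RIP: before the prologue has
# finished or past the epilogue (nothing to restore), and inside the
# body, where the original %rsp is recovered either from a $num-dependent
# slot or from the fixed 40(%rax) save area. The non-volatile GPRs and
# xmm6/xmm7 are then written back into the CONTEXT so RtlVirtualUnwind
# can continue unwinding through these hand-written frames.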
   3325 if ($win64) {
   3326 $rec="%rcx";
   3327 $frame="%rdx";
   3328 $context="%r8";
   3329 $disp="%r9";
   3330 
   3331 $code.=<<___;
   3332 .extern	__imp_RtlVirtualUnwind
   3333 .type	mul_handler,\@abi-omnipotent
   3334 .align	16
   3335 mul_handler:
   3336 	push	%rsi
   3337 	push	%rdi
   3338 	push	%rbx
   3339 	push	%rbp
   3340 	push	%r12
   3341 	push	%r13
   3342 	push	%r14
   3343 	push	%r15
   3344 	pushfq
   3345 	sub	\$64,%rsp
   3346 
   3347 	mov	120($context),%rax	# pull context->Rax
   3348 	mov	248($context),%rbx	# pull context->Rip
   3349 
   3350 	mov	8($disp),%rsi		# disp->ImageBase
   3351 	mov	56($disp),%r11		# disp->HandlerData
   3352 
   3353 	mov	0(%r11),%r10d		# HandlerData[0]
   3354 	lea	(%rsi,%r10),%r10	# end of prologue label
   3355 	cmp	%r10,%rbx		# context->Rip<end of prologue label
   3356 	jb	.Lcommon_seh_tail
   3357 
   3358 	mov	152($context),%rax	# pull context->Rsp
   3359 
   3360 	mov	4(%r11),%r10d		# HandlerData[1]
   3361 	lea	(%rsi,%r10),%r10	# epilogue label
   3362 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   3363 	jae	.Lcommon_seh_tail
   3364 
   3365 	lea	.Lmul_epilogue(%rip),%r10
   3366 	cmp	%r10,%rbx
   3367 	jb	.Lbody_40
   3368 
   3369 	mov	192($context),%r10	# pull $num
   3370 	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
   3371 	jmp	.Lbody_proceed
   3372 
   3373 .Lbody_40:
   3374 	mov	40(%rax),%rax		# pull saved stack pointer
   3375 .Lbody_proceed:
   3376 
   3377 	movaps	-88(%rax),%xmm0
   3378 	movaps	-72(%rax),%xmm1
   3379 
   3380 	mov	-8(%rax),%rbx
   3381 	mov	-16(%rax),%rbp
   3382 	mov	-24(%rax),%r12
   3383 	mov	-32(%rax),%r13
   3384 	mov	-40(%rax),%r14
   3385 	mov	-48(%rax),%r15
   3386 	mov	%rbx,144($context)	# restore context->Rbx
   3387 	mov	%rbp,160($context)	# restore context->Rbp
   3388 	mov	%r12,216($context)	# restore context->R12
   3389 	mov	%r13,224($context)	# restore context->R13
   3390 	mov	%r14,232($context)	# restore context->R14
   3391 	mov	%r15,240($context)	# restore context->R15
   3392 	movups	%xmm0,512($context)	# restore context->Xmm6
   3393 	movups	%xmm1,528($context)	# restore context->Xmm7
   3394 
   3395 .Lcommon_seh_tail:
   3396 	mov	8(%rax),%rdi
   3397 	mov	16(%rax),%rsi
   3398 	mov	%rax,152($context)	# restore context->Rsp
   3399 	mov	%rsi,168($context)	# restore context->Rsi
   3400 	mov	%rdi,176($context)	# restore context->Rdi
   3401 
   3402 	mov	40($disp),%rdi		# disp->ContextRecord
   3403 	mov	$context,%rsi		# context
   3404 	mov	\$154,%ecx		# sizeof(CONTEXT)
   3405 	.long	0xa548f3fc		# cld; rep movsq
   3406 
   3407 	mov	$disp,%rsi
   3408 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   3409 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   3410 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   3411 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   3412 	mov	40(%rsi),%r10		# disp->ContextRecord
   3413 	lea	56(%rsi),%r11		# &disp->HandlerData
   3414 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   3415 	mov	%r10,32(%rsp)		# arg5
   3416 	mov	%r11,40(%rsp)		# arg6
   3417 	mov	%r12,48(%rsp)		# arg7
   3418 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   3419 	call	*__imp_RtlVirtualUnwind(%rip)
   3420 
   3421 	mov	\$1,%eax		# ExceptionContinueSearch
   3422 	add	\$64,%rsp
   3423 	popfq
   3424 	pop	%r15
   3425 	pop	%r14
   3426 	pop	%r13
   3427 	pop	%r12
   3428 	pop	%rbp
   3429 	pop	%rbx
   3430 	pop	%rdi
   3431 	pop	%rsi
   3432 	ret
   3433 .size	mul_handler,.-mul_handler
   3434 
   3435 .section	.pdata
   3436 .align	4
   3437 	.rva	.LSEH_begin_bn_mul_mont_gather5
   3438 	.rva	.LSEH_end_bn_mul_mont_gather5
   3439 	.rva	.LSEH_info_bn_mul_mont_gather5
   3440 
   3441 	.rva	.LSEH_begin_bn_mul4x_mont_gather5
   3442 	.rva	.LSEH_end_bn_mul4x_mont_gather5
   3443 	.rva	.LSEH_info_bn_mul4x_mont_gather5
   3444 
   3445 	.rva	.LSEH_begin_bn_power5
   3446 	.rva	.LSEH_end_bn_power5
   3447 	.rva	.LSEH_info_bn_power5
   3448 
   3449 	.rva	.LSEH_begin_bn_from_mont8x
   3450 	.rva	.LSEH_end_bn_from_mont8x
   3451 	.rva	.LSEH_info_bn_from_mont8x
   3452 ___
   3453 $code.=<<___ if ($addx);
   3454 	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
   3455 	.rva	.LSEH_end_bn_mulx4x_mont_gather5
   3456 	.rva	.LSEH_info_bn_mulx4x_mont_gather5
   3457 
   3458 	.rva	.LSEH_begin_bn_powerx5
   3459 	.rva	.LSEH_end_bn_powerx5
   3460 	.rva	.LSEH_info_bn_powerx5
   3461 ___
   3462 $code.=<<___;
   3463 	.rva	.LSEH_begin_bn_gather5
   3464 	.rva	.LSEH_end_bn_gather5
   3465 	.rva	.LSEH_info_bn_gather5
   3466 
   3467 .section	.xdata
   3468 .align	8
   3469 .LSEH_info_bn_mul_mont_gather5:
   3470 	.byte	9,0,0,0
   3471 	.rva	mul_handler
   3472 	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
   3473 .align	8
   3474 .LSEH_info_bn_mul4x_mont_gather5:
   3475 	.byte	9,0,0,0
   3476 	.rva	mul_handler
   3477 	.rva	.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
   3478 .align	8
   3479 .LSEH_info_bn_power5:
   3480 	.byte	9,0,0,0
   3481 	.rva	mul_handler
   3482 	.rva	.Lpower5_body,.Lpower5_epilogue		# HandlerData[]
   3483 .align	8
   3484 .LSEH_info_bn_from_mont8x:
   3485 	.byte	9,0,0,0
   3486 	.rva	mul_handler
   3487 	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
   3488 ___
   3489 $code.=<<___ if ($addx);
   3490 .align	8
   3491 .LSEH_info_bn_mulx4x_mont_gather5:
   3492 	.byte	9,0,0,0
   3493 	.rva	mul_handler
   3494 	.rva	.Lmulx4x_body,.Lmulx4x_epilogue		# HandlerData[]
   3495 .align	8
   3496 .LSEH_info_bn_powerx5:
   3497 	.byte	9,0,0,0
   3498 	.rva	mul_handler
   3499 	.rva	.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
   3500 ___
   3501 $code.=<<___;
   3502 .align	8
   3503 .LSEH_info_bn_gather5:
    3504 	.byte	0x01,0x0d,0x05,0x00
    3505 	.byte	0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
    3506 	.byte	0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
    3507 	.byte	0x04,0x42,0x00,0x00	#sub	rsp,0x28
   3508 .align	8
   3509 ___
   3510 }
   3511 
   3512 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
   3513 
   3514 print $code;
   3515 close STDOUT;
   3516