Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # August 2011.
     11 #
     12 # Companion to x86_64-mont.pl that optimizes cache-timing attack
     13 # countermeasures. The subroutines are produced by replacing bp[i]
     14 # references in their x86_64-mont.pl counterparts with cache-neutral
     15 # references to powers table computed in BN_mod_exp_mont_consttime.
     16 # In addition subroutine that scatters elements of the powers table
     17 # is implemented, so that scatter-/gathering can be tuned without
     18 # bn_exp.c modifications.
     19 
     20 # August 2013.
     21 #
     22 # Add MULX/AD*X code paths and additional interfaces to optimize for
     23 # branch prediction unit. For input lengths that are multiples of 8
     24 # the np argument is not just modulus value, but one interleaved
     25 # with 0. This is to optimize post-condition...
     26 
     27 $flavour = shift;
     28 $output  = shift;
     29 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     30 
     31 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     32 
     33 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     34 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     35 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     36 die "can't locate x86_64-xlate.pl";
     37 
     38 open OUT,"| \"$^X\" $xlate $flavour $output";
     39 *STDOUT=*OUT;
     40 
     41 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
     42 		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
     43 	$addx = ($1>=2.23);
     44 }
     45 
     46 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
     47 	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
     48 	$addx = ($1>=2.10);
     49 }
     50 
     51 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
     52 	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
     53 	$addx = ($1>=12);
     54 }
     55 
     56 # int bn_mul_mont_gather5(
     57 $rp="%rdi";	# BN_ULONG *rp,
     58 $ap="%rsi";	# const BN_ULONG *ap,
     59 $bp="%rdx";	# const BN_ULONG *bp,
     60 $np="%rcx";	# const BN_ULONG *np,
     61 $n0="%r8";	# const BN_ULONG *n0,
     62 $num="%r9";	# int num,
     63 		# int idx);	# 0 to 2^5-1, "index" in $bp holding
     64 				# pre-computed powers of a', interlaced
     65 				# in such manner that b[0] is $bp[idx],
     66 				# b[1] is [2^5+idx], etc.
     67 $lo0="%r10";
     68 $hi0="%r11";
     69 $hi1="%r13";
     70 $i="%r14";
     71 $j="%r15";
     72 $m0="%rbx";
     73 $m1="%rbp";
     74 
     75 $code=<<___;
     76 .text
     77 
     78 .extern	OPENSSL_ia32cap_P
     79 
     80 .globl	bn_mul_mont_gather5
     81 .type	bn_mul_mont_gather5,\@function,6
     82 .align	64
     83 bn_mul_mont_gather5:
     84 	test	\$7,${num}d
     85 	jnz	.Lmul_enter
     86 ___
     87 $code.=<<___ if ($addx);
     88 	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
     89 ___
     90 $code.=<<___;
     91 	jmp	.Lmul4x_enter
     92 
     93 .align	16
     94 .Lmul_enter:
     95 	mov	${num}d,${num}d
     96 	mov	%rsp,%rax
     97 	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
     98 	push	%rbx
     99 	push	%rbp
    100 	push	%r12
    101 	push	%r13
    102 	push	%r14
    103 	push	%r15
    104 ___
    105 $code.=<<___ if ($win64);
    106 	lea	-0x28(%rsp),%rsp
    107 	movaps	%xmm6,(%rsp)
    108 	movaps	%xmm7,0x10(%rsp)
    109 ___
    110 $code.=<<___;
    111 	lea	2($num),%r11
    112 	neg	%r11
    113 	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
    114 	and	\$-1024,%rsp		# minimize TLB usage
    115 
    116 	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
    117 .Lmul_body:
    118 	mov	$bp,%r12		# reassign $bp
    119 ___
    120 		$bp="%r12";
    121 		$STRIDE=2**5*8;		# 5 is "window size"
    122 		$N=$STRIDE/4;		# should match cache line size
    123 $code.=<<___;
    124 	mov	%r10,%r11
    125 	shr	\$`log($N/8)/log(2)`,%r10
    126 	and	\$`$N/8-1`,%r11
    127 	not	%r10
    128 	lea	.Lmagic_masks(%rip),%rax
    129 	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
    130 	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
    131 	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
    132 	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
    133 	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
    134 	movq	24(%rax,%r10,8),%xmm7
    135 
    136 	movq	`0*$STRIDE/4-96`($bp),%xmm0
    137 	movq	`1*$STRIDE/4-96`($bp),%xmm1
    138 	pand	%xmm4,%xmm0
    139 	movq	`2*$STRIDE/4-96`($bp),%xmm2
    140 	pand	%xmm5,%xmm1
    141 	movq	`3*$STRIDE/4-96`($bp),%xmm3
    142 	pand	%xmm6,%xmm2
    143 	por	%xmm1,%xmm0
    144 	pand	%xmm7,%xmm3
    145 	por	%xmm2,%xmm0
    146 	lea	$STRIDE($bp),$bp
    147 	por	%xmm3,%xmm0
    148 
    149 	movq	%xmm0,$m0		# m0=bp[0]
    150 
    151 	mov	($n0),$n0		# pull n0[0] value
    152 	mov	($ap),%rax
    153 
    154 	xor	$i,$i			# i=0
    155 	xor	$j,$j			# j=0
    156 
    157 	movq	`0*$STRIDE/4-96`($bp),%xmm0
    158 	movq	`1*$STRIDE/4-96`($bp),%xmm1
    159 	pand	%xmm4,%xmm0
    160 	movq	`2*$STRIDE/4-96`($bp),%xmm2
    161 	pand	%xmm5,%xmm1
    162 
    163 	mov	$n0,$m1
    164 	mulq	$m0			# ap[0]*bp[0]
    165 	mov	%rax,$lo0
    166 	mov	($np),%rax
    167 
    168 	movq	`3*$STRIDE/4-96`($bp),%xmm3
    169 	pand	%xmm6,%xmm2
    170 	por	%xmm1,%xmm0
    171 	pand	%xmm7,%xmm3
    172 
    173 	imulq	$lo0,$m1		# "tp[0]"*n0
    174 	mov	%rdx,$hi0
    175 
    176 	por	%xmm2,%xmm0
    177 	lea	$STRIDE($bp),$bp
    178 	por	%xmm3,%xmm0
    179 
    180 	mulq	$m1			# np[0]*m1
    181 	add	%rax,$lo0		# discarded
    182 	mov	8($ap),%rax
    183 	adc	\$0,%rdx
    184 	mov	%rdx,$hi1
    185 
    186 	lea	1($j),$j		# j++
    187 	jmp	.L1st_enter
    188 
    189 .align	16
    190 .L1st:
    191 	add	%rax,$hi1
    192 	mov	($ap,$j,8),%rax
    193 	adc	\$0,%rdx
    194 	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
    195 	mov	$lo0,$hi0
    196 	adc	\$0,%rdx
    197 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    198 	mov	%rdx,$hi1
    199 
    200 .L1st_enter:
    201 	mulq	$m0			# ap[j]*bp[0]
    202 	add	%rax,$hi0
    203 	mov	($np,$j,8),%rax
    204 	adc	\$0,%rdx
    205 	lea	1($j),$j		# j++
    206 	mov	%rdx,$lo0
    207 
    208 	mulq	$m1			# np[j]*m1
    209 	cmp	$num,$j
    210 	jne	.L1st
    211 
    212 	movq	%xmm0,$m0		# bp[1]
    213 
    214 	add	%rax,$hi1
    215 	mov	($ap),%rax		# ap[0]
    216 	adc	\$0,%rdx
    217 	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
    218 	adc	\$0,%rdx
    219 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    220 	mov	%rdx,$hi1
    221 	mov	$lo0,$hi0
    222 
    223 	xor	%rdx,%rdx
    224 	add	$hi0,$hi1
    225 	adc	\$0,%rdx
    226 	mov	$hi1,-8(%rsp,$num,8)
    227 	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
    228 
    229 	lea	1($i),$i		# i++
    230 	jmp	.Louter
    231 .align	16
    232 .Louter:
    233 	xor	$j,$j			# j=0
    234 	mov	$n0,$m1
    235 	mov	(%rsp),$lo0
    236 
    237 	movq	`0*$STRIDE/4-96`($bp),%xmm0
    238 	movq	`1*$STRIDE/4-96`($bp),%xmm1
    239 	pand	%xmm4,%xmm0
    240 	movq	`2*$STRIDE/4-96`($bp),%xmm2
    241 	pand	%xmm5,%xmm1
    242 
    243 	mulq	$m0			# ap[0]*bp[i]
    244 	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
    245 	mov	($np),%rax
    246 	adc	\$0,%rdx
    247 
    248 	movq	`3*$STRIDE/4-96`($bp),%xmm3
    249 	pand	%xmm6,%xmm2
    250 	por	%xmm1,%xmm0
    251 	pand	%xmm7,%xmm3
    252 
    253 	imulq	$lo0,$m1		# tp[0]*n0
    254 	mov	%rdx,$hi0
    255 
    256 	por	%xmm2,%xmm0
    257 	lea	$STRIDE($bp),$bp
    258 	por	%xmm3,%xmm0
    259 
    260 	mulq	$m1			# np[0]*m1
    261 	add	%rax,$lo0		# discarded
    262 	mov	8($ap),%rax
    263 	adc	\$0,%rdx
    264 	mov	8(%rsp),$lo0		# tp[1]
    265 	mov	%rdx,$hi1
    266 
    267 	lea	1($j),$j		# j++
    268 	jmp	.Linner_enter
    269 
    270 .align	16
    271 .Linner:
    272 	add	%rax,$hi1
    273 	mov	($ap,$j,8),%rax
    274 	adc	\$0,%rdx
    275 	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
    276 	mov	(%rsp,$j,8),$lo0
    277 	adc	\$0,%rdx
    278 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    279 	mov	%rdx,$hi1
    280 
    281 .Linner_enter:
    282 	mulq	$m0			# ap[j]*bp[i]
    283 	add	%rax,$hi0
    284 	mov	($np,$j,8),%rax
    285 	adc	\$0,%rdx
    286 	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
    287 	mov	%rdx,$hi0
    288 	adc	\$0,$hi0
    289 	lea	1($j),$j		# j++
    290 
    291 	mulq	$m1			# np[j]*m1
    292 	cmp	$num,$j
    293 	jne	.Linner
    294 
    295 	movq	%xmm0,$m0		# bp[i+1]
    296 
    297 	add	%rax,$hi1
    298 	mov	($ap),%rax		# ap[0]
    299 	adc	\$0,%rdx
    300 	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
    301 	mov	(%rsp,$j,8),$lo0
    302 	adc	\$0,%rdx
    303 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    304 	mov	%rdx,$hi1
    305 
    306 	xor	%rdx,%rdx
    307 	add	$hi0,$hi1
    308 	adc	\$0,%rdx
    309 	add	$lo0,$hi1		# pull upmost overflow bit
    310 	adc	\$0,%rdx
    311 	mov	$hi1,-8(%rsp,$num,8)
    312 	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
    313 
    314 	lea	1($i),$i		# i++
    315 	cmp	$num,$i
    316 	jb	.Louter
    317 
    318 	xor	$i,$i			# i=0 and clear CF!
    319 	mov	(%rsp),%rax		# tp[0]
    320 	lea	(%rsp),$ap		# borrow ap for tp
    321 	mov	$num,$j			# j=num
    322 	jmp	.Lsub
    323 .align	16
    324 .Lsub:	sbb	($np,$i,8),%rax
    325 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
    326 	mov	8($ap,$i,8),%rax	# tp[i+1]
    327 	lea	1($i),$i		# i++
    328 	dec	$j			# doesnn't affect CF!
    329 	jnz	.Lsub
    330 
    331 	sbb	\$0,%rax		# handle upmost overflow bit
    332 	xor	$i,$i
    333 	mov	$num,$j			# j=num
    334 .align	16
    335 .Lcopy:					# copy or in-place refresh
    336 	mov	(%rsp,$i,8),$ap
    337 	mov	($rp,$i,8),$np
    338 	xor	$np,$ap			# conditional select:
    339 	and	%rax,$ap		# ((ap ^ np) & %rax) ^ np
    340 	xor	$np,$ap			# ap = borrow?tp:rp
    341 	mov	$i,(%rsp,$i,8)		# zap temporary vector
    342 	mov	$ap,($rp,$i,8)		# rp[i]=tp[i]
    343 	lea	1($i),$i
    344 	sub	\$1,$j
    345 	jnz	.Lcopy
    346 
    347 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
    348 	mov	\$1,%rax
    349 ___
    350 $code.=<<___ if ($win64);
    351 	movaps	-88(%rsi),%xmm6
    352 	movaps	-72(%rsi),%xmm7
    353 ___
    354 $code.=<<___;
    355 	mov	-48(%rsi),%r15
    356 	mov	-40(%rsi),%r14
    357 	mov	-32(%rsi),%r13
    358 	mov	-24(%rsi),%r12
    359 	mov	-16(%rsi),%rbp
    360 	mov	-8(%rsi),%rbx
    361 	lea	(%rsi),%rsp
    362 .Lmul_epilogue:
    363 	ret
    364 .size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
    365 ___
    366 {{{
    367 my @A=("%r10","%r11");
    368 my @N=("%r13","%rdi");
    369 $code.=<<___;
    370 .type	bn_mul4x_mont_gather5,\@function,6
    371 .align	32
    372 bn_mul4x_mont_gather5:
    373 .Lmul4x_enter:
    374 ___
    375 $code.=<<___ if ($addx);
    376 	and	\$0x80100,%r11d
    377 	cmp	\$0x80100,%r11d
    378 	je	.Lmulx4x_enter
    379 ___
    380 $code.=<<___;
    381 	.byte	0x67
    382 	mov	%rsp,%rax
    383 	push	%rbx
    384 	push	%rbp
    385 	push	%r12
    386 	push	%r13
    387 	push	%r14
    388 	push	%r15
    389 ___
    390 $code.=<<___ if ($win64);
    391 	lea	-0x28(%rsp),%rsp
    392 	movaps	%xmm6,(%rsp)
    393 	movaps	%xmm7,0x10(%rsp)
    394 ___
    395 $code.=<<___;
    396 	.byte	0x67
    397 	mov	${num}d,%r10d
    398 	shl	\$3,${num}d
    399 	shl	\$3+2,%r10d		# 4*$num
    400 	neg	$num			# -$num
    401 
    402 	##############################################################
    403 	# ensure that stack frame doesn't alias with $aptr+4*$num
    404 	# modulo 4096, which covers ret[num], am[num] and n[2*num]
    405 	# (see bn_exp.c). this is done to allow memory disambiguation
    406 	# logic do its magic. [excessive frame is allocated in order
    407 	# to allow bn_from_mont8x to clear it.]
    408 	#
    409 	lea	-64(%rsp,$num,2),%r11
    410 	sub	$ap,%r11
    411 	and	\$4095,%r11
    412 	cmp	%r11,%r10
    413 	jb	.Lmul4xsp_alt
    414 	sub	%r11,%rsp		# align with $ap
    415 	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
    416 	jmp	.Lmul4xsp_done
    417 
    418 .align	32
    419 .Lmul4xsp_alt:
    420 	lea	4096-64(,$num,2),%r10
    421 	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
    422 	sub	%r10,%r11
    423 	mov	\$0,%r10
    424 	cmovc	%r10,%r11
    425 	sub	%r11,%rsp
    426 .Lmul4xsp_done:
    427 	and	\$-64,%rsp
    428 	neg	$num
    429 
    430 	mov	%rax,40(%rsp)
    431 .Lmul4x_body:
    432 
    433 	call	mul4x_internal
    434 
    435 	mov	40(%rsp),%rsi		# restore %rsp
    436 	mov	\$1,%rax
    437 ___
    438 $code.=<<___ if ($win64);
    439 	movaps	-88(%rsi),%xmm6
    440 	movaps	-72(%rsi),%xmm7
    441 ___
    442 $code.=<<___;
    443 	mov	-48(%rsi),%r15
    444 	mov	-40(%rsi),%r14
    445 	mov	-32(%rsi),%r13
    446 	mov	-24(%rsi),%r12
    447 	mov	-16(%rsi),%rbp
    448 	mov	-8(%rsi),%rbx
    449 	lea	(%rsi),%rsp
    450 .Lmul4x_epilogue:
    451 	ret
    452 .size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
    453 
    454 .type	mul4x_internal,\@abi-omnipotent
    455 .align	32
    456 mul4x_internal:
    457 	shl	\$5,$num
    458 	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
    459 	lea	256(%rdx,$num),%r13
    460 	shr	\$5,$num		# restore $num
    461 ___
    462 		$bp="%r12";
    463 		$STRIDE=2**5*8;		# 5 is "window size"
    464 		$N=$STRIDE/4;		# should match cache line size
    465 		$tp=$i;
    466 $code.=<<___;
    467 	mov	%r10,%r11
    468 	shr	\$`log($N/8)/log(2)`,%r10
    469 	and	\$`$N/8-1`,%r11
    470 	not	%r10
    471 	lea	.Lmagic_masks(%rip),%rax
    472 	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
    473 	lea	96(%rdx,%r11,8),$bp	# pointer within 1st cache line
    474 	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
    475 	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
    476 	add	\$7,%r11
    477 	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
    478 	movq	24(%rax,%r10,8),%xmm7
    479 	and	\$7,%r11
    480 
    481 	movq	`0*$STRIDE/4-96`($bp),%xmm0
    482 	lea	$STRIDE($bp),$tp	# borrow $tp
    483 	movq	`1*$STRIDE/4-96`($bp),%xmm1
    484 	pand	%xmm4,%xmm0
    485 	movq	`2*$STRIDE/4-96`($bp),%xmm2
    486 	pand	%xmm5,%xmm1
    487 	movq	`3*$STRIDE/4-96`($bp),%xmm3
    488 	pand	%xmm6,%xmm2
    489 	.byte	0x67
    490 	por	%xmm1,%xmm0
    491 	movq	`0*$STRIDE/4-96`($tp),%xmm1
    492 	.byte	0x67
    493 	pand	%xmm7,%xmm3
    494 	.byte	0x67
    495 	por	%xmm2,%xmm0
    496 	movq	`1*$STRIDE/4-96`($tp),%xmm2
    497 	.byte	0x67
    498 	pand	%xmm4,%xmm1
    499 	.byte	0x67
    500 	por	%xmm3,%xmm0
    501 	movq	`2*$STRIDE/4-96`($tp),%xmm3
    502 
    503 	movq	%xmm0,$m0		# m0=bp[0]
    504 	movq	`3*$STRIDE/4-96`($tp),%xmm0
    505 	mov	%r13,16+8(%rsp)		# save end of b[num]
    506 	mov	$rp, 56+8(%rsp)		# save $rp
    507 
    508 	mov	($n0),$n0		# pull n0[0] value
    509 	mov	($ap),%rax
    510 	lea	($ap,$num),$ap		# end of a[num]
    511 	neg	$num
    512 
    513 	mov	$n0,$m1
    514 	mulq	$m0			# ap[0]*bp[0]
    515 	mov	%rax,$A[0]
    516 	mov	($np),%rax
    517 
    518 	pand	%xmm5,%xmm2
    519 	pand	%xmm6,%xmm3
    520 	por	%xmm2,%xmm1
    521 
    522 	imulq	$A[0],$m1		# "tp[0]"*n0
    523 	##############################################################
    524 	# $tp is chosen so that writing to top-most element of the
    525 	# vector occurs just "above" references to powers table,
    526 	# "above" modulo cache-line size, which effectively precludes
    527 	# possibility of memory disambiguation logic failure when
    528 	# accessing the table.
    529 	# 
    530 	lea	64+8(%rsp,%r11,8),$tp
    531 	mov	%rdx,$A[1]
    532 
    533 	pand	%xmm7,%xmm0
    534 	por	%xmm3,%xmm1
    535 	lea	2*$STRIDE($bp),$bp
    536 	por	%xmm1,%xmm0
    537 
    538 	mulq	$m1			# np[0]*m1
    539 	add	%rax,$A[0]		# discarded
    540 	mov	8($ap,$num),%rax
    541 	adc	\$0,%rdx
    542 	mov	%rdx,$N[1]
    543 
    544 	mulq	$m0
    545 	add	%rax,$A[1]
    546 	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
    547 	adc	\$0,%rdx
    548 	mov	%rdx,$A[0]
    549 
    550 	mulq	$m1
    551 	add	%rax,$N[1]
    552 	mov	16($ap,$num),%rax
    553 	adc	\$0,%rdx
    554 	add	$A[1],$N[1]
    555 	lea	4*8($num),$j		# j=4
    556 	lea	16*4($np),$np
    557 	adc	\$0,%rdx
    558 	mov	$N[1],($tp)
    559 	mov	%rdx,$N[0]
    560 	jmp	.L1st4x
    561 
    562 .align	32
    563 .L1st4x:
    564 	mulq	$m0			# ap[j]*bp[0]
    565 	add	%rax,$A[0]
    566 	mov	-16*2($np),%rax
    567 	lea	32($tp),$tp
    568 	adc	\$0,%rdx
    569 	mov	%rdx,$A[1]
    570 
    571 	mulq	$m1			# np[j]*m1
    572 	add	%rax,$N[0]
    573 	mov	-8($ap,$j),%rax
    574 	adc	\$0,%rdx
    575 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    576 	adc	\$0,%rdx
    577 	mov	$N[0],-24($tp)		# tp[j-1]
    578 	mov	%rdx,$N[1]
    579 
    580 	mulq	$m0			# ap[j]*bp[0]
    581 	add	%rax,$A[1]
    582 	mov	-16*1($np),%rax
    583 	adc	\$0,%rdx
    584 	mov	%rdx,$A[0]
    585 
    586 	mulq	$m1			# np[j]*m1
    587 	add	%rax,$N[1]
    588 	mov	($ap,$j),%rax
    589 	adc	\$0,%rdx
    590 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    591 	adc	\$0,%rdx
    592 	mov	$N[1],-16($tp)		# tp[j-1]
    593 	mov	%rdx,$N[0]
    594 
    595 	mulq	$m0			# ap[j]*bp[0]
    596 	add	%rax,$A[0]
    597 	mov	16*0($np),%rax
    598 	adc	\$0,%rdx
    599 	mov	%rdx,$A[1]
    600 
    601 	mulq	$m1			# np[j]*m1
    602 	add	%rax,$N[0]
    603 	mov	8($ap,$j),%rax
    604 	adc	\$0,%rdx
    605 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    606 	adc	\$0,%rdx
    607 	mov	$N[0],-8($tp)		# tp[j-1]
    608 	mov	%rdx,$N[1]
    609 
    610 	mulq	$m0			# ap[j]*bp[0]
    611 	add	%rax,$A[1]
    612 	mov	16*1($np),%rax
    613 	adc	\$0,%rdx
    614 	mov	%rdx,$A[0]
    615 
    616 	mulq	$m1			# np[j]*m1
    617 	add	%rax,$N[1]
    618 	mov	16($ap,$j),%rax
    619 	adc	\$0,%rdx
    620 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    621 	lea	16*4($np),$np
    622 	adc	\$0,%rdx
    623 	mov	$N[1],($tp)		# tp[j-1]
    624 	mov	%rdx,$N[0]
    625 
    626 	add	\$32,$j			# j+=4
    627 	jnz	.L1st4x
    628 
    629 	mulq	$m0			# ap[j]*bp[0]
    630 	add	%rax,$A[0]
    631 	mov	-16*2($np),%rax
    632 	lea	32($tp),$tp
    633 	adc	\$0,%rdx
    634 	mov	%rdx,$A[1]
    635 
    636 	mulq	$m1			# np[j]*m1
    637 	add	%rax,$N[0]
    638 	mov	-8($ap),%rax
    639 	adc	\$0,%rdx
    640 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    641 	adc	\$0,%rdx
    642 	mov	$N[0],-24($tp)		# tp[j-1]
    643 	mov	%rdx,$N[1]
    644 
    645 	mulq	$m0			# ap[j]*bp[0]
    646 	add	%rax,$A[1]
    647 	mov	-16*1($np),%rax
    648 	adc	\$0,%rdx
    649 	mov	%rdx,$A[0]
    650 
    651 	mulq	$m1			# np[j]*m1
    652 	add	%rax,$N[1]
    653 	mov	($ap,$num),%rax		# ap[0]
    654 	adc	\$0,%rdx
    655 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    656 	adc	\$0,%rdx
    657 	mov	$N[1],-16($tp)		# tp[j-1]
    658 	mov	%rdx,$N[0]
    659 
    660 	movq	%xmm0,$m0		# bp[1]
    661 	lea	($np,$num,2),$np	# rewind $np
    662 
    663 	xor	$N[1],$N[1]
    664 	add	$A[0],$N[0]
    665 	adc	\$0,$N[1]
    666 	mov	$N[0],-8($tp)
    667 
    668 	jmp	.Louter4x
    669 
    670 .align	32
    671 .Louter4x:
    672 	mov	($tp,$num),$A[0]
    673 	mov	$n0,$m1
    674 	mulq	$m0			# ap[0]*bp[i]
    675 	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
    676 	mov	($np),%rax
    677 	adc	\$0,%rdx
    678 
    679 	movq	`0*$STRIDE/4-96`($bp),%xmm0
    680 	movq	`1*$STRIDE/4-96`($bp),%xmm1
    681 	pand	%xmm4,%xmm0
    682 	movq	`2*$STRIDE/4-96`($bp),%xmm2
    683 	pand	%xmm5,%xmm1
    684 	movq	`3*$STRIDE/4-96`($bp),%xmm3
    685 
    686 	imulq	$A[0],$m1		# tp[0]*n0
    687 	.byte	0x67
    688 	mov	%rdx,$A[1]
    689 	mov	$N[1],($tp)		# store upmost overflow bit
    690 
    691 	pand	%xmm6,%xmm2
    692 	por	%xmm1,%xmm0
    693 	pand	%xmm7,%xmm3
    694 	por	%xmm2,%xmm0
    695 	lea	($tp,$num),$tp		# rewind $tp
    696 	lea	$STRIDE($bp),$bp
    697 	por	%xmm3,%xmm0
    698 
    699 	mulq	$m1			# np[0]*m1
    700 	add	%rax,$A[0]		# "$N[0]", discarded
    701 	mov	8($ap,$num),%rax
    702 	adc	\$0,%rdx
    703 	mov	%rdx,$N[1]
    704 
    705 	mulq	$m0			# ap[j]*bp[i]
    706 	add	%rax,$A[1]
    707 	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
    708 	adc	\$0,%rdx
    709 	add	8($tp),$A[1]		# +tp[1]
    710 	adc	\$0,%rdx
    711 	mov	%rdx,$A[0]
    712 
    713 	mulq	$m1			# np[j]*m1
    714 	add	%rax,$N[1]
    715 	mov	16($ap,$num),%rax
    716 	adc	\$0,%rdx
    717 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
    718 	lea	4*8($num),$j		# j=4
    719 	lea	16*4($np),$np
    720 	adc	\$0,%rdx
    721 	mov	%rdx,$N[0]
    722 	jmp	.Linner4x
    723 
    724 .align	32
    725 .Linner4x:
    726 	mulq	$m0			# ap[j]*bp[i]
    727 	add	%rax,$A[0]
    728 	mov	-16*2($np),%rax
    729 	adc	\$0,%rdx
    730 	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
    731 	lea	32($tp),$tp
    732 	adc	\$0,%rdx
    733 	mov	%rdx,$A[1]
    734 
    735 	mulq	$m1			# np[j]*m1
    736 	add	%rax,$N[0]
    737 	mov	-8($ap,$j),%rax
    738 	adc	\$0,%rdx
    739 	add	$A[0],$N[0]
    740 	adc	\$0,%rdx
    741 	mov	$N[1],-32($tp)		# tp[j-1]
    742 	mov	%rdx,$N[1]
    743 
    744 	mulq	$m0			# ap[j]*bp[i]
    745 	add	%rax,$A[1]
    746 	mov	-16*1($np),%rax
    747 	adc	\$0,%rdx
    748 	add	-8($tp),$A[1]
    749 	adc	\$0,%rdx
    750 	mov	%rdx,$A[0]
    751 
    752 	mulq	$m1			# np[j]*m1
    753 	add	%rax,$N[1]
    754 	mov	($ap,$j),%rax
    755 	adc	\$0,%rdx
    756 	add	$A[1],$N[1]
    757 	adc	\$0,%rdx
    758 	mov	$N[0],-24($tp)		# tp[j-1]
    759 	mov	%rdx,$N[0]
    760 
    761 	mulq	$m0			# ap[j]*bp[i]
    762 	add	%rax,$A[0]
    763 	mov	16*0($np),%rax
    764 	adc	\$0,%rdx
    765 	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
    766 	adc	\$0,%rdx
    767 	mov	%rdx,$A[1]
    768 
    769 	mulq	$m1			# np[j]*m1
    770 	add	%rax,$N[0]
    771 	mov	8($ap,$j),%rax
    772 	adc	\$0,%rdx
    773 	add	$A[0],$N[0]
    774 	adc	\$0,%rdx
    775 	mov	$N[1],-16($tp)		# tp[j-1]
    776 	mov	%rdx,$N[1]
    777 
    778 	mulq	$m0			# ap[j]*bp[i]
    779 	add	%rax,$A[1]
    780 	mov	16*1($np),%rax
    781 	adc	\$0,%rdx
    782 	add	8($tp),$A[1]
    783 	adc	\$0,%rdx
    784 	mov	%rdx,$A[0]
    785 
    786 	mulq	$m1			# np[j]*m1
    787 	add	%rax,$N[1]
    788 	mov	16($ap,$j),%rax
    789 	adc	\$0,%rdx
    790 	add	$A[1],$N[1]
    791 	lea	16*4($np),$np
    792 	adc	\$0,%rdx
    793 	mov	$N[0],-8($tp)		# tp[j-1]
    794 	mov	%rdx,$N[0]
    795 
    796 	add	\$32,$j			# j+=4
    797 	jnz	.Linner4x
    798 
    799 	mulq	$m0			# ap[j]*bp[i]
    800 	add	%rax,$A[0]
    801 	mov	-16*2($np),%rax
    802 	adc	\$0,%rdx
    803 	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
    804 	lea	32($tp),$tp
    805 	adc	\$0,%rdx
    806 	mov	%rdx,$A[1]
    807 
    808 	mulq	$m1			# np[j]*m1
    809 	add	%rax,$N[0]
    810 	mov	-8($ap),%rax
    811 	adc	\$0,%rdx
    812 	add	$A[0],$N[0]
    813 	adc	\$0,%rdx
    814 	mov	$N[1],-32($tp)		# tp[j-1]
    815 	mov	%rdx,$N[1]
    816 
    817 	mulq	$m0			# ap[j]*bp[i]
    818 	add	%rax,$A[1]
    819 	mov	$m1,%rax
    820 	mov	-16*1($np),$m1
    821 	adc	\$0,%rdx
    822 	add	-8($tp),$A[1]
    823 	adc	\$0,%rdx
    824 	mov	%rdx,$A[0]
    825 
    826 	mulq	$m1			# np[j]*m1
    827 	add	%rax,$N[1]
    828 	mov	($ap,$num),%rax		# ap[0]
    829 	adc	\$0,%rdx
    830 	add	$A[1],$N[1]
    831 	adc	\$0,%rdx
    832 	mov	$N[0],-24($tp)		# tp[j-1]
    833 	mov	%rdx,$N[0]
    834 
    835 	movq	%xmm0,$m0		# bp[i+1]
    836 	mov	$N[1],-16($tp)		# tp[j-1]
    837 	lea	($np,$num,2),$np	# rewind $np
    838 
    839 	xor	$N[1],$N[1]
    840 	add	$A[0],$N[0]
    841 	adc	\$0,$N[1]
    842 	add	($tp),$N[0]		# pull upmost overflow bit
    843 	adc	\$0,$N[1]		# upmost overflow bit
    844 	mov	$N[0],-8($tp)
    845 
    846 	cmp	16+8(%rsp),$bp
    847 	jb	.Louter4x
    848 ___
    849 if (1) {
    850 $code.=<<___;
    851 	sub	$N[0],$m1		# compare top-most words
    852 	adc	$j,$j			# $j is zero
    853 	or	$j,$N[1]
    854 	xor	\$1,$N[1]
    855 	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
    856 	lea	($np,$N[1],8),%rbp	# nptr in .sqr4x_sub
    857 	mov	%r9,%rcx
    858 	sar	\$3+2,%rcx		# cf=0
    859 	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
    860 	jmp	.Lsqr4x_sub
    861 ___
    862 } else {
    863 my @ri=("%rax",$bp,$m0,$m1);
    864 my $rp="%rdx";
    865 $code.=<<___
    866 	xor	\$1,$N[1]
    867 	lea	($tp,$num),$tp		# rewind $tp
    868 	sar	\$5,$num		# cf=0
    869 	lea	($np,$N[1],8),$np
    870 	mov	56+8(%rsp),$rp		# restore $rp
    871 	jmp	.Lsub4x
    872 
    873 .align	32
    874 .Lsub4x:
    875 	.byte	0x66
    876 	mov	8*0($tp),@ri[0]
    877 	mov	8*1($tp),@ri[1]
    878 	.byte	0x66
    879 	sbb	16*0($np),@ri[0]
    880 	mov	8*2($tp),@ri[2]
    881 	sbb	16*1($np),@ri[1]
    882 	mov	3*8($tp),@ri[3]
    883 	lea	4*8($tp),$tp
    884 	sbb	16*2($np),@ri[2]
    885 	mov	@ri[0],8*0($rp)
    886 	sbb	16*3($np),@ri[3]
    887 	lea	16*4($np),$np
    888 	mov	@ri[1],8*1($rp)
    889 	mov	@ri[2],8*2($rp)
    890 	mov	@ri[3],8*3($rp)
    891 	lea	8*4($rp),$rp
    892 
    893 	inc	$num
    894 	jnz	.Lsub4x
    895 
    896 	ret
    897 ___
    898 }
    899 $code.=<<___;
    900 .size	mul4x_internal,.-mul4x_internal
    901 ___
    902 }}}
    903 {{{
    905 ######################################################################
    906 # void bn_power5(
    907 my $rptr="%rdi";	# BN_ULONG *rptr,
    908 my $aptr="%rsi";	# const BN_ULONG *aptr,
    909 my $bptr="%rdx";	# const void *table,
    910 my $nptr="%rcx";	# const BN_ULONG *nptr,
    911 my $n0  ="%r8";		# const BN_ULONG *n0);
    912 my $num ="%r9";		# int num, has to be divisible by 8
    913 			# int pwr 
    914 
    915 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
    916 my @A0=("%r10","%r11");
    917 my @A1=("%r12","%r13");
    918 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
    919 
    920 $code.=<<___;
    921 .globl	bn_power5
    922 .type	bn_power5,\@function,6
    923 .align	32
    924 bn_power5:
    925 ___
    926 $code.=<<___ if ($addx);
    927 	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
    928 	and	\$0x80100,%r11d
    929 	cmp	\$0x80100,%r11d
    930 	je	.Lpowerx5_enter
    931 ___
    932 $code.=<<___;
    933 	mov	%rsp,%rax
    934 	push	%rbx
    935 	push	%rbp
    936 	push	%r12
    937 	push	%r13
    938 	push	%r14
    939 	push	%r15
    940 ___
    941 $code.=<<___ if ($win64);
    942 	lea	-0x28(%rsp),%rsp
    943 	movaps	%xmm6,(%rsp)
    944 	movaps	%xmm7,0x10(%rsp)
    945 ___
    946 $code.=<<___;
    947 	mov	${num}d,%r10d
    948 	shl	\$3,${num}d		# convert $num to bytes
    949 	shl	\$3+2,%r10d		# 4*$num
    950 	neg	$num
    951 	mov	($n0),$n0		# *n0
    952 
    953 	##############################################################
    954 	# ensure that stack frame doesn't alias with $aptr+4*$num
    955 	# modulo 4096, which covers ret[num], am[num] and n[2*num]
    956 	# (see bn_exp.c). this is done to allow memory disambiguation
    957 	# logic do its magic.
    958 	#
    959 	lea	-64(%rsp,$num,2),%r11
    960 	sub	$aptr,%r11
    961 	and	\$4095,%r11
    962 	cmp	%r11,%r10
    963 	jb	.Lpwr_sp_alt
    964 	sub	%r11,%rsp		# align with $aptr
    965 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
    966 	jmp	.Lpwr_sp_done
    967 
    968 .align	32
    969 .Lpwr_sp_alt:
    970 	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
    971 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
    972 	sub	%r10,%r11
    973 	mov	\$0,%r10
    974 	cmovc	%r10,%r11
    975 	sub	%r11,%rsp
    976 .Lpwr_sp_done:
    977 	and	\$-64,%rsp
    978 	mov	$num,%r10	
    979 	neg	$num
    980 
    981 	##############################################################
    982 	# Stack layout
    983 	#
    984 	# +0	saved $num, used in reduction section
    985 	# +8	&t[2*$num], used in reduction section
    986 	# +32	saved *n0
    987 	# +40	saved %rsp
    988 	# +48	t[2*$num]
    989 	#
    990 	mov	$n0,  32(%rsp)
    991 	mov	%rax, 40(%rsp)		# save original %rsp
    992 .Lpower5_body:
    993 	movq	$rptr,%xmm1		# save $rptr
    994 	movq	$nptr,%xmm2		# save $nptr
    995 	movq	%r10, %xmm3		# -$num
    996 	movq	$bptr,%xmm4
    997 
    998 	call	__bn_sqr8x_internal
    999 	call	__bn_sqr8x_internal
   1000 	call	__bn_sqr8x_internal
   1001 	call	__bn_sqr8x_internal
   1002 	call	__bn_sqr8x_internal
   1003 
   1004 	movq	%xmm2,$nptr
   1005 	movq	%xmm4,$bptr
   1006 	mov	$aptr,$rptr
   1007 	mov	40(%rsp),%rax
   1008 	lea	32(%rsp),$n0
   1009 
   1010 	call	mul4x_internal
   1011 
   1012 	mov	40(%rsp),%rsi		# restore %rsp
   1013 	mov	\$1,%rax
   1014 	mov	-48(%rsi),%r15
   1015 	mov	-40(%rsi),%r14
   1016 	mov	-32(%rsi),%r13
   1017 	mov	-24(%rsi),%r12
   1018 	mov	-16(%rsi),%rbp
   1019 	mov	-8(%rsi),%rbx
   1020 	lea	(%rsi),%rsp
   1021 .Lpower5_epilogue:
   1022 	ret
   1023 .size	bn_power5,.-bn_power5
   1024 
   1025 .globl	bn_sqr8x_internal
   1026 .hidden	bn_sqr8x_internal
   1027 .type	bn_sqr8x_internal,\@abi-omnipotent
   1028 .align	32
   1029 bn_sqr8x_internal:
   1030 __bn_sqr8x_internal:
   1031 	##############################################################
   1032 	# Squaring part:
   1033 	#
   1034 	# a) multiply-n-add everything but a[i]*a[i];
   1035 	# b) shift result of a) by 1 to the left and accumulate
   1036 	#    a[i]*a[i] products;
   1037 	#
   1038 	##############################################################
   1039 	#                                                     a[1]a[0]
   1040 	#                                                 a[2]a[0]
   1041 	#                                             a[3]a[0]
   1042 	#                                             a[2]a[1]
   1043 	#                                         a[4]a[0]
   1044 	#                                         a[3]a[1]
   1045 	#                                     a[5]a[0]
   1046 	#                                     a[4]a[1]
   1047 	#                                     a[3]a[2]
   1048 	#                                 a[6]a[0]
   1049 	#                                 a[5]a[1]
   1050 	#                                 a[4]a[2]
   1051 	#                             a[7]a[0]
   1052 	#                             a[6]a[1]
   1053 	#                             a[5]a[2]
   1054 	#                             a[4]a[3]
   1055 	#                         a[7]a[1]
   1056 	#                         a[6]a[2]
   1057 	#                         a[5]a[3]
   1058 	#                     a[7]a[2]
   1059 	#                     a[6]a[3]
   1060 	#                     a[5]a[4]
   1061 	#                 a[7]a[3]
   1062 	#                 a[6]a[4]
   1063 	#             a[7]a[4]
   1064 	#             a[6]a[5]
   1065 	#         a[7]a[5]
   1066 	#     a[7]a[6]
   1067 	#                                                     a[1]a[0]
   1068 	#                                                 a[2]a[0]
   1069 	#                                             a[3]a[0]
   1070 	#                                         a[4]a[0]
   1071 	#                                     a[5]a[0]
   1072 	#                                 a[6]a[0]
   1073 	#                             a[7]a[0]
   1074 	#                                             a[2]a[1]
   1075 	#                                         a[3]a[1]
   1076 	#                                     a[4]a[1]
   1077 	#                                 a[5]a[1]
   1078 	#                             a[6]a[1]
   1079 	#                         a[7]a[1]
   1080 	#                                     a[3]a[2]
   1081 	#                                 a[4]a[2]
   1082 	#                             a[5]a[2]
   1083 	#                         a[6]a[2]
   1084 	#                     a[7]a[2]
   1085 	#                             a[4]a[3]
   1086 	#                         a[5]a[3]
   1087 	#                     a[6]a[3]
   1088 	#                 a[7]a[3]
   1089 	#                     a[5]a[4]
   1090 	#                 a[6]a[4]
   1091 	#             a[7]a[4]
   1092 	#             a[6]a[5]
   1093 	#         a[7]a[5]
   1094 	#     a[7]a[6]
   1095 	#                                                         a[0]a[0]
   1096 	#                                                 a[1]a[1]
   1097 	#                                         a[2]a[2]
   1098 	#                                 a[3]a[3]
   1099 	#                         a[4]a[4]
   1100 	#                 a[5]a[5]
   1101 	#         a[6]a[6]
   1102 	# a[7]a[7]
   1103 
   1104 	lea	32(%r10),$i		# $i=-($num-32)
   1105 	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
   1106 
   1107 	mov	$num,$j			# $j=$num
   1108 
   1109 					# comments apply to $num==8 case
   1110 	mov	-32($aptr,$i),$a0	# a[0]
   1111 	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
   1112 	mov	-24($aptr,$i),%rax	# a[1]
   1113 	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
   1114 	mov	-16($aptr,$i),$ai	# a[2]
   1115 	mov	%rax,$a1
   1116 
   1117 	mul	$a0			# a[1]*a[0]
   1118 	mov	%rax,$A0[0]		# a[1]*a[0]
   1119 	 mov	$ai,%rax		# a[2]
   1120 	mov	%rdx,$A0[1]
   1121 	mov	$A0[0],-24($tptr,$i)	# t[1]
   1122 
   1123 	mul	$a0			# a[2]*a[0]
   1124 	add	%rax,$A0[1]
   1125 	 mov	$ai,%rax
   1126 	adc	\$0,%rdx
   1127 	mov	$A0[1],-16($tptr,$i)	# t[2]
   1128 	mov	%rdx,$A0[0]
   1129 
   1130 
   1131 	 mov	-8($aptr,$i),$ai	# a[3]
   1132 	mul	$a1			# a[2]*a[1]
   1133 	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
   1134 	 mov	$ai,%rax
   1135 	mov	%rdx,$A1[1]
   1136 
   1137 	 lea	($i),$j
   1138 	mul	$a0			# a[3]*a[0]
   1139 	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
   1140 	 mov	$ai,%rax
   1141 	mov	%rdx,$A0[1]
   1142 	adc	\$0,$A0[1]
   1143 	add	$A1[0],$A0[0]
   1144 	adc	\$0,$A0[1]
   1145 	mov	$A0[0],-8($tptr,$j)	# t[3]
   1146 	jmp	.Lsqr4x_1st
   1147 
   1148 .align	32
   1149 .Lsqr4x_1st:
   1150 	 mov	($aptr,$j),$ai		# a[4]
   1151 	mul	$a1			# a[3]*a[1]
   1152 	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
   1153 	 mov	$ai,%rax
   1154 	mov	%rdx,$A1[0]
   1155 	adc	\$0,$A1[0]
   1156 
   1157 	mul	$a0			# a[4]*a[0]
   1158 	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
   1159 	 mov	$ai,%rax		# a[3]
   1160 	 mov	8($aptr,$j),$ai		# a[5]
   1161 	mov	%rdx,$A0[0]
   1162 	adc	\$0,$A0[0]
   1163 	add	$A1[1],$A0[1]
   1164 	adc	\$0,$A0[0]
   1165 
   1166 
   1167 	mul	$a1			# a[4]*a[3]
   1168 	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
   1169 	 mov	$ai,%rax
   1170 	 mov	$A0[1],($tptr,$j)	# t[4]
   1171 	mov	%rdx,$A1[1]
   1172 	adc	\$0,$A1[1]
   1173 
   1174 	mul	$a0			# a[5]*a[2]
   1175 	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
   1176 	 mov	$ai,%rax
   1177 	 mov	16($aptr,$j),$ai	# a[6]
   1178 	mov	%rdx,$A0[1]
   1179 	adc	\$0,$A0[1]
   1180 	add	$A1[0],$A0[0]
   1181 	adc	\$0,$A0[1]
   1182 
   1183 	mul	$a1			# a[5]*a[3]
   1184 	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
   1185 	 mov	$ai,%rax
   1186 	 mov	$A0[0],8($tptr,$j)	# t[5]
   1187 	mov	%rdx,$A1[0]
   1188 	adc	\$0,$A1[0]
   1189 
   1190 	mul	$a0			# a[6]*a[2]
   1191 	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
   1192 	 mov	$ai,%rax		# a[3]
   1193 	 mov	24($aptr,$j),$ai	# a[7]
   1194 	mov	%rdx,$A0[0]
   1195 	adc	\$0,$A0[0]
   1196 	add	$A1[1],$A0[1]
   1197 	adc	\$0,$A0[0]
   1198 
   1199 
   1200 	mul	$a1			# a[6]*a[5]
   1201 	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
   1202 	 mov	$ai,%rax
   1203 	 mov	$A0[1],16($tptr,$j)	# t[6]
   1204 	mov	%rdx,$A1[1]
   1205 	adc	\$0,$A1[1]
   1206 	 lea	32($j),$j
   1207 
   1208 	mul	$a0			# a[7]*a[4]
   1209 	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
   1210 	 mov	$ai,%rax
   1211 	mov	%rdx,$A0[1]
   1212 	adc	\$0,$A0[1]
   1213 	add	$A1[0],$A0[0]
   1214 	adc	\$0,$A0[1]
   1215 	mov	$A0[0],-8($tptr,$j)	# t[7]
   1216 
   1217 	cmp	\$0,$j
   1218 	jne	.Lsqr4x_1st
   1219 
   1220 	mul	$a1			# a[7]*a[5]
   1221 	add	%rax,$A1[1]
   1222 	lea	16($i),$i
   1223 	adc	\$0,%rdx
   1224 	add	$A0[1],$A1[1]
   1225 	adc	\$0,%rdx
   1226 
   1227 	mov	$A1[1],($tptr)		# t[8]
   1228 	mov	%rdx,$A1[0]
   1229 	mov	%rdx,8($tptr)		# t[9]
   1230 	jmp	.Lsqr4x_outer
   1231 
   1232 .align	32
   1233 .Lsqr4x_outer:				# comments apply to $num==6 case
   1234 	mov	-32($aptr,$i),$a0	# a[0]
   1235 	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
   1236 	mov	-24($aptr,$i),%rax	# a[1]
   1237 	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
   1238 	mov	-16($aptr,$i),$ai	# a[2]
   1239 	mov	%rax,$a1
   1240 
   1241 	mul	$a0			# a[1]*a[0]
   1242 	mov	-24($tptr,$i),$A0[0]	# t[1]
   1243 	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
   1244 	 mov	$ai,%rax		# a[2]
   1245 	adc	\$0,%rdx
   1246 	mov	$A0[0],-24($tptr,$i)	# t[1]
   1247 	mov	%rdx,$A0[1]
   1248 
   1249 	mul	$a0			# a[2]*a[0]
   1250 	add	%rax,$A0[1]
   1251 	 mov	$ai,%rax
   1252 	adc	\$0,%rdx
   1253 	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
   1254 	mov	%rdx,$A0[0]
   1255 	adc	\$0,$A0[0]
   1256 	mov	$A0[1],-16($tptr,$i)	# t[2]
   1257 
   1258 	xor	$A1[0],$A1[0]
   1259 
   1260 	 mov	-8($aptr,$i),$ai	# a[3]
   1261 	mul	$a1			# a[2]*a[1]
   1262 	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
   1263 	 mov	$ai,%rax
   1264 	adc	\$0,%rdx
   1265 	add	-8($tptr,$i),$A1[0]
   1266 	mov	%rdx,$A1[1]
   1267 	adc	\$0,$A1[1]
   1268 
   1269 	mul	$a0			# a[3]*a[0]
   1270 	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
   1271 	 mov	$ai,%rax
   1272 	adc	\$0,%rdx
   1273 	add	$A1[0],$A0[0]
   1274 	mov	%rdx,$A0[1]
   1275 	adc	\$0,$A0[1]
   1276 	mov	$A0[0],-8($tptr,$i)	# t[3]
   1277 
   1278 	lea	($i),$j
   1279 	jmp	.Lsqr4x_inner
   1280 
   1281 .align	32
   1282 .Lsqr4x_inner:
   1283 	 mov	($aptr,$j),$ai		# a[4]
   1284 	mul	$a1			# a[3]*a[1]
   1285 	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
   1286 	 mov	$ai,%rax
   1287 	mov	%rdx,$A1[0]
   1288 	adc	\$0,$A1[0]
   1289 	add	($tptr,$j),$A1[1]
   1290 	adc	\$0,$A1[0]
   1291 
   1292 	.byte	0x67
   1293 	mul	$a0			# a[4]*a[0]
   1294 	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
   1295 	 mov	$ai,%rax		# a[3]
   1296 	 mov	8($aptr,$j),$ai		# a[5]
   1297 	mov	%rdx,$A0[0]
   1298 	adc	\$0,$A0[0]
   1299 	add	$A1[1],$A0[1]
   1300 	adc	\$0,$A0[0]
   1301 
   1302 	mul	$a1			# a[4]*a[3]
   1303 	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
   1304 	mov	$A0[1],($tptr,$j)	# t[4]
   1305 	 mov	$ai,%rax
   1306 	mov	%rdx,$A1[1]
   1307 	adc	\$0,$A1[1]
   1308 	add	8($tptr,$j),$A1[0]
   1309 	lea	16($j),$j		# j++
   1310 	adc	\$0,$A1[1]
   1311 
   1312 	mul	$a0			# a[5]*a[2]
   1313 	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
   1314 	 mov	$ai,%rax
   1315 	adc	\$0,%rdx
   1316 	add	$A1[0],$A0[0]
   1317 	mov	%rdx,$A0[1]
   1318 	adc	\$0,$A0[1]
   1319 	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
   1320 
   1321 	cmp	\$0,$j
   1322 	jne	.Lsqr4x_inner
   1323 
   1324 	.byte	0x67
   1325 	mul	$a1			# a[5]*a[3]
   1326 	add	%rax,$A1[1]
   1327 	adc	\$0,%rdx
   1328 	add	$A0[1],$A1[1]
   1329 	adc	\$0,%rdx
   1330 
   1331 	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
   1332 	mov	%rdx,$A1[0]
   1333 	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below
   1334 
   1335 	add	\$16,$i
   1336 	jnz	.Lsqr4x_outer
   1337 
   1338 					# comments apply to $num==4 case
   1339 	mov	-32($aptr),$a0		# a[0]
   1340 	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
   1341 	mov	-24($aptr),%rax		# a[1]
   1342 	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
   1343 	mov	-16($aptr),$ai		# a[2]
   1344 	mov	%rax,$a1
   1345 
   1346 	mul	$a0			# a[1]*a[0]
   1347 	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
   1348 	 mov	$ai,%rax		# a[2]
   1349 	mov	%rdx,$A0[1]
   1350 	adc	\$0,$A0[1]
   1351 
   1352 	mul	$a0			# a[2]*a[0]
   1353 	add	%rax,$A0[1]
   1354 	 mov	$ai,%rax
   1355 	 mov	$A0[0],-24($tptr)	# t[1]
   1356 	mov	%rdx,$A0[0]
   1357 	adc	\$0,$A0[0]
   1358 	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
   1359 	 mov	-8($aptr),$ai		# a[3]
   1360 	adc	\$0,$A0[0]
   1361 
   1362 	mul	$a1			# a[2]*a[1]
   1363 	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
   1364 	 mov	$ai,%rax
   1365 	 mov	$A0[1],-16($tptr)	# t[2]
   1366 	mov	%rdx,$A1[1]
   1367 	adc	\$0,$A1[1]
   1368 
   1369 	mul	$a0			# a[3]*a[0]
   1370 	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
   1371 	 mov	$ai,%rax
   1372 	mov	%rdx,$A0[1]
   1373 	adc	\$0,$A0[1]
   1374 	add	$A1[0],$A0[0]
   1375 	adc	\$0,$A0[1]
   1376 	mov	$A0[0],-8($tptr)	# t[3]
   1377 
   1378 	mul	$a1			# a[3]*a[1]
   1379 	add	%rax,$A1[1]
   1380 	 mov	-16($aptr),%rax		# a[2]
   1381 	adc	\$0,%rdx
   1382 	add	$A0[1],$A1[1]
   1383 	adc	\$0,%rdx
   1384 
   1385 	mov	$A1[1],($tptr)		# t[4]
   1386 	mov	%rdx,$A1[0]
   1387 	mov	%rdx,8($tptr)		# t[5]
   1388 
   1389 	mul	$ai			# a[2]*a[3]
   1390 ___
   1391 {
   1392 my ($shift,$carry)=($a0,$a1);
   1393 my @S=(@A1,$ai,$n0);
   1394 $code.=<<___;
   1395 	 add	\$16,$i
   1396 	 xor	$shift,$shift
   1397 	 sub	$num,$i			# $i=16-$num
   1398 	 xor	$carry,$carry
   1399 
   1400 	add	$A1[0],%rax		# t[5]
   1401 	adc	\$0,%rdx
   1402 	mov	%rax,8($tptr)		# t[5]
   1403 	mov	%rdx,16($tptr)		# t[6]
   1404 	mov	$carry,24($tptr)	# t[7]
   1405 
   1406 	 mov	-16($aptr,$i),%rax	# a[0]
   1407 	lea	48+8(%rsp),$tptr
   1408 	 xor	$A0[0],$A0[0]		# t[0]
   1409 	 mov	8($tptr),$A0[1]		# t[1]
   1410 
   1411 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1412 	shr	\$63,$A0[0]
   1413 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1414 	shr	\$63,$A0[1]
   1415 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1416 	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1417 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1418 	mul	%rax			# a[i]*a[i]
   1419 	neg	$carry			# mov $carry,cf
   1420 	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1421 	adc	%rax,$S[0]
   1422 	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
   1423 	mov	$S[0],($tptr)
   1424 	adc	%rdx,$S[1]
   1425 
   1426 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
   1427 	 mov	$S[1],8($tptr)
   1428 	 sbb	$carry,$carry		# mov cf,$carry
   1429 	shr	\$63,$A0[0]
   1430 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1431 	shr	\$63,$A0[1]
   1432 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1433 	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1434 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1435 	mul	%rax			# a[i]*a[i]
   1436 	neg	$carry			# mov $carry,cf
   1437 	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1438 	adc	%rax,$S[2]
   1439 	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
   1440 	mov	$S[2],16($tptr)
   1441 	adc	%rdx,$S[3]
   1442 	lea	16($i),$i
   1443 	mov	$S[3],24($tptr)
   1444 	sbb	$carry,$carry		# mov cf,$carry
   1445 	lea	64($tptr),$tptr
   1446 	jmp	.Lsqr4x_shift_n_add
   1447 
   1448 .align	32
   1449 .Lsqr4x_shift_n_add:
   1450 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1451 	shr	\$63,$A0[0]
   1452 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1453 	shr	\$63,$A0[1]
   1454 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1455 	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1456 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1457 	mul	%rax			# a[i]*a[i]
   1458 	neg	$carry			# mov $carry,cf
   1459 	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1460 	adc	%rax,$S[0]
   1461 	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
   1462 	mov	$S[0],-32($tptr)
   1463 	adc	%rdx,$S[1]
   1464 
   1465 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
   1466 	 mov	$S[1],-24($tptr)
   1467 	 sbb	$carry,$carry		# mov cf,$carry
   1468 	shr	\$63,$A0[0]
   1469 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1470 	shr	\$63,$A0[1]
   1471 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1472 	 mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
   1473 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1474 	mul	%rax			# a[i]*a[i]
   1475 	neg	$carry			# mov $carry,cf
   1476 	 mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
   1477 	adc	%rax,$S[2]
   1478 	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
   1479 	mov	$S[2],-16($tptr)
   1480 	adc	%rdx,$S[3]
   1481 
   1482 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1483 	 mov	$S[3],-8($tptr)
   1484 	 sbb	$carry,$carry		# mov cf,$carry
   1485 	shr	\$63,$A0[0]
   1486 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1487 	shr	\$63,$A0[1]
   1488 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1489 	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1490 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1491 	mul	%rax			# a[i]*a[i]
   1492 	neg	$carry			# mov $carry,cf
   1493 	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1494 	adc	%rax,$S[0]
   1495 	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
   1496 	mov	$S[0],0($tptr)
   1497 	adc	%rdx,$S[1]
   1498 
   1499 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
   1500 	 mov	$S[1],8($tptr)
   1501 	 sbb	$carry,$carry		# mov cf,$carry
   1502 	shr	\$63,$A0[0]
   1503 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1504 	shr	\$63,$A0[1]
   1505 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1506 	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1507 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1508 	mul	%rax			# a[i]*a[i]
   1509 	neg	$carry			# mov $carry,cf
   1510 	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1511 	adc	%rax,$S[2]
   1512 	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
   1513 	mov	$S[2],16($tptr)
   1514 	adc	%rdx,$S[3]
   1515 	mov	$S[3],24($tptr)
   1516 	sbb	$carry,$carry		# mov cf,$carry
   1517 	lea	64($tptr),$tptr
   1518 	add	\$32,$i
   1519 	jnz	.Lsqr4x_shift_n_add
   1520 
   1521 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1522 	.byte	0x67
   1523 	shr	\$63,$A0[0]
   1524 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1525 	shr	\$63,$A0[1]
   1526 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1527 	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1528 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1529 	mul	%rax			# a[i]*a[i]
   1530 	neg	$carry			# mov $carry,cf
   1531 	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1532 	adc	%rax,$S[0]
   1533 	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
   1534 	mov	$S[0],-32($tptr)
   1535 	adc	%rdx,$S[1]
   1536 
   1537 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
   1538 	 mov	$S[1],-24($tptr)
   1539 	 sbb	$carry,$carry		# mov cf,$carry
   1540 	shr	\$63,$A0[0]
   1541 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1542 	shr	\$63,$A0[1]
   1543 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1544 	mul	%rax			# a[i]*a[i]
   1545 	neg	$carry			# mov $carry,cf
   1546 	adc	%rax,$S[2]
   1547 	adc	%rdx,$S[3]
   1548 	mov	$S[2],-16($tptr)
   1549 	mov	$S[3],-8($tptr)
   1550 ___
   1551 }
   1553 ######################################################################
   1554 # Montgomery reduction part, "word-by-word" algorithm.
   1555 #
   1556 # This new path is inspired by multiple submissions from Intel, by
   1557 # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
   1558 # Vinodh Gopal...
   1559 {
   1560 my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
   1561 
   1562 $code.=<<___;
   1563 	movq	%xmm2,$nptr
   1564 sqr8x_reduction:
   1565 	xor	%rax,%rax
   1566 	lea	($nptr,$num,2),%rcx	# end of n[]
   1567 	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
   1568 	mov	%rcx,0+8(%rsp)
   1569 	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
   1570 	mov	%rdx,8+8(%rsp)
   1571 	neg	$num
   1572 	jmp	.L8x_reduction_loop
   1573 
   1574 .align	32
   1575 .L8x_reduction_loop:
   1576 	lea	($tptr,$num),$tptr	# start of current t[] window
   1577 	.byte	0x66
   1578 	mov	8*0($tptr),$m0
   1579 	mov	8*1($tptr),%r9
   1580 	mov	8*2($tptr),%r10
   1581 	mov	8*3($tptr),%r11
   1582 	mov	8*4($tptr),%r12
   1583 	mov	8*5($tptr),%r13
   1584 	mov	8*6($tptr),%r14
   1585 	mov	8*7($tptr),%r15
   1586 	mov	%rax,(%rdx)		# store top-most carry bit
   1587 	lea	8*8($tptr),$tptr
   1588 
   1589 	.byte	0x67
   1590 	mov	$m0,%r8
   1591 	imulq	32+8(%rsp),$m0		# n0*a[0]
   1592 	mov	16*0($nptr),%rax	# n[0]
   1593 	mov	\$8,%ecx
   1594 	jmp	.L8x_reduce
   1595 
   1596 .align	32
   1597 .L8x_reduce:
   1598 	mulq	$m0
   1599 	 mov	16*1($nptr),%rax	# n[1]
   1600 	neg	%r8
   1601 	mov	%rdx,%r8
   1602 	adc	\$0,%r8
   1603 
   1604 	mulq	$m0
   1605 	add	%rax,%r9
   1606 	 mov	16*2($nptr),%rax
   1607 	adc	\$0,%rdx
   1608 	add	%r9,%r8
   1609 	 mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
   1610 	mov	%rdx,%r9
   1611 	adc	\$0,%r9
   1612 
   1613 	mulq	$m0
   1614 	add	%rax,%r10
   1615 	 mov	16*3($nptr),%rax
   1616 	adc	\$0,%rdx
   1617 	add	%r10,%r9
   1618 	 mov	32+8(%rsp),$carry	# pull n0, borrow $carry
   1619 	mov	%rdx,%r10
   1620 	adc	\$0,%r10
   1621 
   1622 	mulq	$m0
   1623 	add	%rax,%r11
   1624 	 mov	16*4($nptr),%rax
   1625 	adc	\$0,%rdx
   1626 	 imulq	%r8,$carry		# modulo-scheduled
   1627 	add	%r11,%r10
   1628 	mov	%rdx,%r11
   1629 	adc	\$0,%r11
   1630 
   1631 	mulq	$m0
   1632 	add	%rax,%r12
   1633 	 mov	16*5($nptr),%rax
   1634 	adc	\$0,%rdx
   1635 	add	%r12,%r11
   1636 	mov	%rdx,%r12
   1637 	adc	\$0,%r12
   1638 
   1639 	mulq	$m0
   1640 	add	%rax,%r13
   1641 	 mov	16*6($nptr),%rax
   1642 	adc	\$0,%rdx
   1643 	add	%r13,%r12
   1644 	mov	%rdx,%r13
   1645 	adc	\$0,%r13
   1646 
   1647 	mulq	$m0
   1648 	add	%rax,%r14
   1649 	 mov	16*7($nptr),%rax
   1650 	adc	\$0,%rdx
   1651 	add	%r14,%r13
   1652 	mov	%rdx,%r14
   1653 	adc	\$0,%r14
   1654 
   1655 	mulq	$m0
   1656 	 mov	$carry,$m0		# n0*a[i]
   1657 	add	%rax,%r15
   1658 	 mov	16*0($nptr),%rax	# n[0]
   1659 	adc	\$0,%rdx
   1660 	add	%r15,%r14
   1661 	mov	%rdx,%r15
   1662 	adc	\$0,%r15
   1663 
   1664 	dec	%ecx
   1665 	jnz	.L8x_reduce
   1666 
   1667 	lea	16*8($nptr),$nptr
   1668 	xor	%rax,%rax
   1669 	mov	8+8(%rsp),%rdx		# pull end of t[]
   1670 	cmp	0+8(%rsp),$nptr		# end of n[]?
   1671 	jae	.L8x_no_tail
   1672 
   1673 	.byte	0x66
   1674 	add	8*0($tptr),%r8
   1675 	adc	8*1($tptr),%r9
   1676 	adc	8*2($tptr),%r10
   1677 	adc	8*3($tptr),%r11
   1678 	adc	8*4($tptr),%r12
   1679 	adc	8*5($tptr),%r13
   1680 	adc	8*6($tptr),%r14
   1681 	adc	8*7($tptr),%r15
   1682 	sbb	$carry,$carry		# top carry
   1683 
   1684 	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
   1685 	mov	\$8,%ecx
   1686 	mov	16*0($nptr),%rax
   1687 	jmp	.L8x_tail
   1688 
   1689 .align	32
   1690 .L8x_tail:
   1691 	mulq	$m0
   1692 	add	%rax,%r8
   1693 	 mov	16*1($nptr),%rax
   1694 	 mov	%r8,($tptr)		# save result
   1695 	mov	%rdx,%r8
   1696 	adc	\$0,%r8
   1697 
   1698 	mulq	$m0
   1699 	add	%rax,%r9
   1700 	 mov	16*2($nptr),%rax
   1701 	adc	\$0,%rdx
   1702 	add	%r9,%r8
   1703 	 lea	8($tptr),$tptr		# $tptr++
   1704 	mov	%rdx,%r9
   1705 	adc	\$0,%r9
   1706 
   1707 	mulq	$m0
   1708 	add	%rax,%r10
   1709 	 mov	16*3($nptr),%rax
   1710 	adc	\$0,%rdx
   1711 	add	%r10,%r9
   1712 	mov	%rdx,%r10
   1713 	adc	\$0,%r10
   1714 
   1715 	mulq	$m0
   1716 	add	%rax,%r11
   1717 	 mov	16*4($nptr),%rax
   1718 	adc	\$0,%rdx
   1719 	add	%r11,%r10
   1720 	mov	%rdx,%r11
   1721 	adc	\$0,%r11
   1722 
   1723 	mulq	$m0
   1724 	add	%rax,%r12
   1725 	 mov	16*5($nptr),%rax
   1726 	adc	\$0,%rdx
   1727 	add	%r12,%r11
   1728 	mov	%rdx,%r12
   1729 	adc	\$0,%r12
   1730 
   1731 	mulq	$m0
   1732 	add	%rax,%r13
   1733 	 mov	16*6($nptr),%rax
   1734 	adc	\$0,%rdx
   1735 	add	%r13,%r12
   1736 	mov	%rdx,%r13
   1737 	adc	\$0,%r13
   1738 
   1739 	mulq	$m0
   1740 	add	%rax,%r14
   1741 	 mov	16*7($nptr),%rax
   1742 	adc	\$0,%rdx
   1743 	add	%r14,%r13
   1744 	mov	%rdx,%r14
   1745 	adc	\$0,%r14
   1746 
   1747 	mulq	$m0
   1748 	 mov	48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
   1749 	add	%rax,%r15
   1750 	adc	\$0,%rdx
   1751 	add	%r15,%r14
   1752 	 mov	16*0($nptr),%rax	# pull n[0]
   1753 	mov	%rdx,%r15
   1754 	adc	\$0,%r15
   1755 
   1756 	dec	%ecx
   1757 	jnz	.L8x_tail
   1758 
   1759 	lea	16*8($nptr),$nptr
   1760 	mov	8+8(%rsp),%rdx		# pull end of t[]
   1761 	cmp	0+8(%rsp),$nptr		# end of n[]?
   1762 	jae	.L8x_tail_done		# break out of loop
   1763 
   1764 	 mov	48+56+8(%rsp),$m0	# pull n0*a[0]
   1765 	neg	$carry
   1766 	 mov	8*0($nptr),%rax		# pull n[0]
   1767 	adc	8*0($tptr),%r8
   1768 	adc	8*1($tptr),%r9
   1769 	adc	8*2($tptr),%r10
   1770 	adc	8*3($tptr),%r11
   1771 	adc	8*4($tptr),%r12
   1772 	adc	8*5($tptr),%r13
   1773 	adc	8*6($tptr),%r14
   1774 	adc	8*7($tptr),%r15
   1775 	sbb	$carry,$carry		# top carry
   1776 
   1777 	mov	\$8,%ecx
   1778 	jmp	.L8x_tail
   1779 
   1780 .align	32
   1781 .L8x_tail_done:
   1782 	add	(%rdx),%r8		# can this overflow?
   1783 	xor	%rax,%rax
   1784 
   1785 	neg	$carry
   1786 .L8x_no_tail:
   1787 	adc	8*0($tptr),%r8
   1788 	adc	8*1($tptr),%r9
   1789 	adc	8*2($tptr),%r10
   1790 	adc	8*3($tptr),%r11
   1791 	adc	8*4($tptr),%r12
   1792 	adc	8*5($tptr),%r13
   1793 	adc	8*6($tptr),%r14
   1794 	adc	8*7($tptr),%r15
   1795 	adc	\$0,%rax		# top-most carry
   1796 	 mov	-16($nptr),%rcx		# np[num-1]
   1797 	 xor	$carry,$carry
   1798 
   1799 	movq	%xmm2,$nptr		# restore $nptr
   1800 
   1801 	mov	%r8,8*0($tptr)		# store top 512 bits
   1802 	mov	%r9,8*1($tptr)
   1803 	 movq	%xmm3,$num		# $num is %r9, can't be moved upwards
   1804 	mov	%r10,8*2($tptr)
   1805 	mov	%r11,8*3($tptr)
   1806 	mov	%r12,8*4($tptr)
   1807 	mov	%r13,8*5($tptr)
   1808 	mov	%r14,8*6($tptr)
   1809 	mov	%r15,8*7($tptr)
   1810 	lea	8*8($tptr),$tptr
   1811 
   1812 	cmp	%rdx,$tptr		# end of t[]?
   1813 	jb	.L8x_reduction_loop
   1814 ___
   1815 }
   1817 ##############################################################
   1818 # Post-condition, 4x unrolled
   1819 #
   1820 {
   1821 my ($tptr,$nptr)=("%rbx","%rbp");
   1822 $code.=<<___;
   1823 	#xor	%rsi,%rsi		# %rsi was $carry above
   1824 	sub	%r15,%rcx		# compare top-most words
   1825 	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
   1826 	adc	%rsi,%rsi
   1827 	mov	$num,%rcx
   1828 	or	%rsi,%rax
   1829 	movq	%xmm1,$rptr		# restore $rptr
   1830 	xor	\$1,%rax
   1831 	movq	%xmm1,$aptr		# prepare for back-to-back call
   1832 	lea	($nptr,%rax,8),$nptr
   1833 	sar	\$3+2,%rcx		# cf=0
   1834 	jmp	.Lsqr4x_sub
   1835 
   1836 .align	32
   1837 .Lsqr4x_sub:
   1838 	.byte	0x66
   1839 	mov	8*0($tptr),%r12
   1840 	mov	8*1($tptr),%r13
   1841 	sbb	16*0($nptr),%r12
   1842 	mov	8*2($tptr),%r14
   1843 	sbb	16*1($nptr),%r13
   1844 	mov	8*3($tptr),%r15
   1845 	lea	8*4($tptr),$tptr
   1846 	sbb	16*2($nptr),%r14
   1847 	mov	%r12,8*0($rptr)
   1848 	sbb	16*3($nptr),%r15
   1849 	lea	16*4($nptr),$nptr
   1850 	mov	%r13,8*1($rptr)
   1851 	mov	%r14,8*2($rptr)
   1852 	mov	%r15,8*3($rptr)
   1853 	lea	8*4($rptr),$rptr
   1854 
   1855 	inc	%rcx			# pass %cf
   1856 	jnz	.Lsqr4x_sub
   1857 ___
   1858 }
   1859 $code.=<<___;
   1860 	mov	$num,%r10		# prepare for back-to-back call
   1861 	neg	$num			# restore $num	
   1862 	ret
   1863 .size	bn_sqr8x_internal,.-bn_sqr8x_internal
   1864 ___
   1865 {
   1866 $code.=<<___;
   1867 .globl	bn_from_montgomery
   1868 .type	bn_from_montgomery,\@abi-omnipotent
   1869 .align	32
   1870 bn_from_montgomery:
   1871 	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
   1872 	jz	bn_from_mont8x
   1873 	xor	%eax,%eax
   1874 	ret
   1875 .size	bn_from_montgomery,.-bn_from_montgomery
   1876 
   1877 .type	bn_from_mont8x,\@function,6
   1878 .align	32
   1879 bn_from_mont8x:
   1880 	.byte	0x67
   1881 	mov	%rsp,%rax
   1882 	push	%rbx
   1883 	push	%rbp
   1884 	push	%r12
   1885 	push	%r13
   1886 	push	%r14
   1887 	push	%r15
   1888 ___
   1889 $code.=<<___ if ($win64);
   1890 	lea	-0x28(%rsp),%rsp
   1891 	movaps	%xmm6,(%rsp)
   1892 	movaps	%xmm7,0x10(%rsp)
   1893 ___
   1894 $code.=<<___;
   1895 	.byte	0x67
   1896 	mov	${num}d,%r10d
   1897 	shl	\$3,${num}d		# convert $num to bytes
   1898 	shl	\$3+2,%r10d		# 4*$num
   1899 	neg	$num
   1900 	mov	($n0),$n0		# *n0
   1901 
   1902 	##############################################################
   1903 	# ensure that stack frame doesn't alias with $aptr+4*$num
   1904 	# modulo 4096, which covers ret[num], am[num] and n[2*num]
   1905 	# (see bn_exp.c). this is done to allow memory disambiguation
   1906 	# logic do its magic.
   1907 	#
   1908 	lea	-64(%rsp,$num,2),%r11
   1909 	sub	$aptr,%r11
   1910 	and	\$4095,%r11
   1911 	cmp	%r11,%r10
   1912 	jb	.Lfrom_sp_alt
   1913 	sub	%r11,%rsp		# align with $aptr
   1914 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
   1915 	jmp	.Lfrom_sp_done
   1916 
   1917 .align	32
   1918 .Lfrom_sp_alt:
   1919 	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
   1920 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
   1921 	sub	%r10,%r11
   1922 	mov	\$0,%r10
   1923 	cmovc	%r10,%r11
   1924 	sub	%r11,%rsp
   1925 .Lfrom_sp_done:
   1926 	and	\$-64,%rsp
   1927 	mov	$num,%r10	
   1928 	neg	$num
   1929 
   1930 	##############################################################
   1931 	# Stack layout
   1932 	#
   1933 	# +0	saved $num, used in reduction section
   1934 	# +8	&t[2*$num], used in reduction section
   1935 	# +32	saved *n0
   1936 	# +40	saved %rsp
   1937 	# +48	t[2*$num]
   1938 	#
   1939 	mov	$n0,  32(%rsp)
   1940 	mov	%rax, 40(%rsp)		# save original %rsp
   1941 .Lfrom_body:
   1942 	mov	$num,%r11
   1943 	lea	48(%rsp),%rax
   1944 	pxor	%xmm0,%xmm0
   1945 	jmp	.Lmul_by_1
   1946 
   1947 .align	32
   1948 .Lmul_by_1:
   1949 	movdqu	($aptr),%xmm1
   1950 	movdqu	16($aptr),%xmm2
   1951 	movdqu	32($aptr),%xmm3
   1952 	movdqa	%xmm0,(%rax,$num)
   1953 	movdqu	48($aptr),%xmm4
   1954 	movdqa	%xmm0,16(%rax,$num)
   1955 	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea	64($aptr),$aptr
   1956 	movdqa	%xmm1,(%rax)
   1957 	movdqa	%xmm0,32(%rax,$num)
   1958 	movdqa	%xmm2,16(%rax)
   1959 	movdqa	%xmm0,48(%rax,$num)
   1960 	movdqa	%xmm3,32(%rax)
   1961 	movdqa	%xmm4,48(%rax)
   1962 	lea	64(%rax),%rax
   1963 	sub	\$64,%r11
   1964 	jnz	.Lmul_by_1
   1965 
   1966 	movq	$rptr,%xmm1
   1967 	movq	$nptr,%xmm2
   1968 	.byte	0x67
   1969 	mov	$nptr,%rbp
   1970 	movq	%r10, %xmm3		# -num
   1971 ___
   1972 $code.=<<___ if ($addx);
   1973 	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
   1974 	and	\$0x80100,%r11d
   1975 	cmp	\$0x80100,%r11d
   1976 	jne	.Lfrom_mont_nox
   1977 
   1978 	lea	(%rax,$num),$rptr
   1979 	call	sqrx8x_reduction
   1980 
   1981 	pxor	%xmm0,%xmm0
   1982 	lea	48(%rsp),%rax
   1983 	mov	40(%rsp),%rsi		# restore %rsp
   1984 	jmp	.Lfrom_mont_zero
   1985 
   1986 .align	32
   1987 .Lfrom_mont_nox:
   1988 ___
   1989 $code.=<<___;
   1990 	call	sqr8x_reduction
   1991 
   1992 	pxor	%xmm0,%xmm0
   1993 	lea	48(%rsp),%rax
   1994 	mov	40(%rsp),%rsi		# restore %rsp
   1995 	jmp	.Lfrom_mont_zero
   1996 
   1997 .align	32
   1998 .Lfrom_mont_zero:
   1999 	movdqa	%xmm0,16*0(%rax)
   2000 	movdqa	%xmm0,16*1(%rax)
   2001 	movdqa	%xmm0,16*2(%rax)
   2002 	movdqa	%xmm0,16*3(%rax)
   2003 	lea	16*4(%rax),%rax
   2004 	sub	\$32,$num
   2005 	jnz	.Lfrom_mont_zero
   2006 
   2007 	mov	\$1,%rax
   2008 	mov	-48(%rsi),%r15
   2009 	mov	-40(%rsi),%r14
   2010 	mov	-32(%rsi),%r13
   2011 	mov	-24(%rsi),%r12
   2012 	mov	-16(%rsi),%rbp
   2013 	mov	-8(%rsi),%rbx
   2014 	lea	(%rsi),%rsp
   2015 .Lfrom_epilogue:
   2016 	ret
   2017 .size	bn_from_mont8x,.-bn_from_mont8x
   2018 ___
   2019 }
   2020 }}}
   2021 
   2023 if ($addx) {{{
   2024 my $bp="%rdx";	# restore original value
   2025 
   2026 $code.=<<___;
   2027 .type	bn_mulx4x_mont_gather5,\@function,6
   2028 .align	32
   2029 bn_mulx4x_mont_gather5:
   2030 .Lmulx4x_enter:
   2031 	.byte	0x67
   2032 	mov	%rsp,%rax
   2033 	push	%rbx
   2034 	push	%rbp
   2035 	push	%r12
   2036 	push	%r13
   2037 	push	%r14
   2038 	push	%r15
   2039 ___
   2040 $code.=<<___ if ($win64);
   2041 	lea	-0x28(%rsp),%rsp
   2042 	movaps	%xmm6,(%rsp)
   2043 	movaps	%xmm7,0x10(%rsp)
   2044 ___
   2045 $code.=<<___;
   2046 	.byte	0x67
   2047 	mov	${num}d,%r10d
   2048 	shl	\$3,${num}d		# convert $num to bytes
   2049 	shl	\$3+2,%r10d		# 4*$num
   2050 	neg	$num			# -$num
   2051 	mov	($n0),$n0		# *n0
   2052 
   2053 	##############################################################
   2054 	# ensure that stack frame doesn't alias with $aptr+4*$num
   2055 	# modulo 4096, which covers a[num], ret[num] and n[2*num]
   2056 	# (see bn_exp.c). this is done to allow memory disambiguation
   2057 	# logic do its magic. [excessive frame is allocated in order
   2058 	# to allow bn_from_mont8x to clear it.]
   2059 	#
   2060 	lea	-64(%rsp,$num,2),%r11
   2061 	sub	$ap,%r11
   2062 	and	\$4095,%r11
   2063 	cmp	%r11,%r10
   2064 	jb	.Lmulx4xsp_alt
   2065 	sub	%r11,%rsp		# align with $aptr
   2066 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+$num)
   2067 	jmp	.Lmulx4xsp_done
   2068 
   2069 .align	32
   2070 .Lmulx4xsp_alt:
   2071 	lea	4096-64(,$num,2),%r10	# 4096-frame-$num
   2072 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+$num)
   2073 	sub	%r10,%r11
   2074 	mov	\$0,%r10
   2075 	cmovc	%r10,%r11
   2076 	sub	%r11,%rsp
   2077 .Lmulx4xsp_done:	
   2078 	and	\$-64,%rsp		# ensure alignment
   2079 	##############################################################
   2080 	# Stack layout
   2081 	# +0	-num
   2082 	# +8	off-loaded &b[i]
   2083 	# +16	end of b[num]
   2084 	# +24	inner counter
   2085 	# +32	saved n0
   2086 	# +40	saved %rsp
   2087 	# +48
   2088 	# +56	saved rp
   2089 	# +64	tmp[num+1]
   2090 	#
   2091 	mov	$n0, 32(%rsp)		# save *n0
   2092 	mov	%rax,40(%rsp)		# save original %rsp
   2093 .Lmulx4x_body:
   2094 	call	mulx4x_internal
   2095 
   2096 	mov	40(%rsp),%rsi		# restore %rsp
   2097 	mov	\$1,%rax
   2098 ___
   2099 $code.=<<___ if ($win64);
   2100 	movaps	-88(%rsi),%xmm6
   2101 	movaps	-72(%rsi),%xmm7
   2102 ___
   2103 $code.=<<___;
   2104 	mov	-48(%rsi),%r15
   2105 	mov	-40(%rsi),%r14
   2106 	mov	-32(%rsi),%r13
   2107 	mov	-24(%rsi),%r12
   2108 	mov	-16(%rsi),%rbp
   2109 	mov	-8(%rsi),%rbx
   2110 	lea	(%rsi),%rsp
   2111 .Lmulx4x_epilogue:
   2112 	ret
   2113 .size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
   2114 
   2115 .type	mulx4x_internal,\@abi-omnipotent
   2116 .align	32
   2117 mulx4x_internal:
   2118 	.byte	0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00	# mov	$num,8(%rsp)		# save -$num
   2119 	.byte	0x67
   2120 	neg	$num			# restore $num
   2121 	shl	\$5,$num
   2122 	lea	256($bp,$num),%r13
   2123 	shr	\$5+5,$num
   2124 	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
   2125 	sub	\$1,$num
   2126 	mov	%r13,16+8(%rsp)		# end of b[num]
   2127 	mov	$num,24+8(%rsp)		# inner counter
   2128 	mov	$rp, 56+8(%rsp)		# save $rp
   2129 ___
   2130 my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   2131    ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
   2132 my $rptr=$bptr;
   2133 my $STRIDE=2**5*8;		# 5 is "window size"
   2134 my $N=$STRIDE/4;		# should match cache line size
   2135 $code.=<<___;
   2136 	mov	%r10,%r11
   2137 	shr	\$`log($N/8)/log(2)`,%r10
   2138 	and	\$`$N/8-1`,%r11
   2139 	not	%r10
   2140 	lea	.Lmagic_masks(%rip),%rax
   2141 	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
   2142 	lea	96($bp,%r11,8),$bptr	# pointer within 1st cache line
   2143 	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
   2144 	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
   2145 	add	\$7,%r11
   2146 	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
   2147 	movq	24(%rax,%r10,8),%xmm7
   2148 	and	\$7,%r11
   2149 
   2150 	movq	`0*$STRIDE/4-96`($bptr),%xmm0
   2151 	lea	$STRIDE($bptr),$tptr	# borrow $tptr
   2152 	movq	`1*$STRIDE/4-96`($bptr),%xmm1
   2153 	pand	%xmm4,%xmm0
   2154 	movq	`2*$STRIDE/4-96`($bptr),%xmm2
   2155 	pand	%xmm5,%xmm1
   2156 	movq	`3*$STRIDE/4-96`($bptr),%xmm3
   2157 	pand	%xmm6,%xmm2
   2158 	por	%xmm1,%xmm0
   2159 	movq	`0*$STRIDE/4-96`($tptr),%xmm1
   2160 	pand	%xmm7,%xmm3
   2161 	por	%xmm2,%xmm0
   2162 	movq	`1*$STRIDE/4-96`($tptr),%xmm2
   2163 	por	%xmm3,%xmm0
   2164 	.byte	0x67,0x67
   2165 	pand	%xmm4,%xmm1
   2166 	movq	`2*$STRIDE/4-96`($tptr),%xmm3
   2167 
   2168 	movq	%xmm0,%rdx		# bp[0]
   2169 	movq	`3*$STRIDE/4-96`($tptr),%xmm0
   2170 	lea	2*$STRIDE($bptr),$bptr	# next &b[i]
   2171 	pand	%xmm5,%xmm2
   2172 	.byte	0x67,0x67
   2173 	pand	%xmm6,%xmm3
   2174 	##############################################################
   2175 	# $tptr is chosen so that writing to top-most element of the
   2176 	# vector occurs just "above" references to powers table,
   2177 	# "above" modulo cache-line size, which effectively precludes
   2178 	# possibility of memory disambiguation logic failure when
   2179 	# accessing the table.
   2180 	# 
   2181 	lea	64+8*4+8(%rsp,%r11,8),$tptr
   2182 
   2183 	mov	%rdx,$bi
   2184 	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
   2185 	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
   2186 	add	%rax,%r11
   2187 	mulx	2*8($aptr),%rax,%r13	# ...
   2188 	adc	%rax,%r12
   2189 	adc	\$0,%r13
   2190 	mulx	3*8($aptr),%rax,%r14
   2191 
   2192 	mov	$mi,%r15
   2193 	imulq	32+8(%rsp),$mi		# "t[0]"*n0
   2194 	xor	$zero,$zero		# cf=0, of=0
   2195 	mov	$mi,%rdx
   2196 
   2197 	por	%xmm2,%xmm1
   2198 	pand	%xmm7,%xmm0
   2199 	por	%xmm3,%xmm1
   2200 	mov	$bptr,8+8(%rsp)		# off-load &b[i]
   2201 	por	%xmm1,%xmm0
   2202 
   2203 	.byte	0x48,0x8d,0xb6,0x20,0x00,0x00,0x00	# lea	4*8($aptr),$aptr
   2204 	adcx	%rax,%r13
   2205 	adcx	$zero,%r14		# cf=0
   2206 
   2207 	mulx	0*16($nptr),%rax,%r10
   2208 	adcx	%rax,%r15		# discarded
   2209 	adox	%r11,%r10
   2210 	mulx	1*16($nptr),%rax,%r11
   2211 	adcx	%rax,%r10
   2212 	adox	%r12,%r11
   2213 	mulx	2*16($nptr),%rax,%r12
   2214 	mov	24+8(%rsp),$bptr	# counter value
   2215 	.byte	0x66
   2216 	mov	%r10,-8*4($tptr)
   2217 	adcx	%rax,%r11
   2218 	adox	%r13,%r12
   2219 	mulx	3*16($nptr),%rax,%r15
   2220 	 .byte	0x67,0x67
   2221 	 mov	$bi,%rdx
   2222 	mov	%r11,-8*3($tptr)
   2223 	adcx	%rax,%r12
   2224 	adox	$zero,%r15		# of=0
   2225 	.byte	0x48,0x8d,0x89,0x40,0x00,0x00,0x00	# lea	4*16($nptr),$nptr
   2226 	mov	%r12,-8*2($tptr)
   2227 	#jmp	.Lmulx4x_1st
   2228 
   2229 .align	32
   2230 .Lmulx4x_1st:
   2231 	adcx	$zero,%r15		# cf=0, modulo-scheduled
   2232 	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
   2233 	adcx	%r14,%r10
   2234 	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
   2235 	adcx	%rax,%r11
   2236 	mulx	2*8($aptr),%r12,%rax	# ...
   2237 	adcx	%r14,%r12
   2238 	mulx	3*8($aptr),%r13,%r14
   2239 	 .byte	0x67,0x67
   2240 	 mov	$mi,%rdx
   2241 	adcx	%rax,%r13
   2242 	adcx	$zero,%r14		# cf=0
   2243 	lea	4*8($aptr),$aptr
   2244 	lea	4*8($tptr),$tptr
   2245 
   2246 	adox	%r15,%r10
   2247 	mulx	0*16($nptr),%rax,%r15
   2248 	adcx	%rax,%r10
   2249 	adox	%r15,%r11
   2250 	mulx	1*16($nptr),%rax,%r15
   2251 	adcx	%rax,%r11
   2252 	adox	%r15,%r12
   2253 	mulx	2*16($nptr),%rax,%r15
   2254 	mov	%r10,-5*8($tptr)
   2255 	adcx	%rax,%r12
   2256 	mov	%r11,-4*8($tptr)
   2257 	adox	%r15,%r13
   2258 	mulx	3*16($nptr),%rax,%r15
   2259 	 mov	$bi,%rdx
   2260 	mov	%r12,-3*8($tptr)
   2261 	adcx	%rax,%r13
   2262 	adox	$zero,%r15
   2263 	lea	4*16($nptr),$nptr
   2264 	mov	%r13,-2*8($tptr)
   2265 
   2266 	dec	$bptr			# of=0, pass cf
   2267 	jnz	.Lmulx4x_1st
   2268 
   2269 	mov	8(%rsp),$num		# load -num
   2270 	movq	%xmm0,%rdx		# bp[1]
   2271 	adc	$zero,%r15		# modulo-scheduled
   2272 	lea	($aptr,$num),$aptr	# rewind $aptr
   2273 	add	%r15,%r14
   2274 	mov	8+8(%rsp),$bptr		# re-load &b[i]
   2275 	adc	$zero,$zero		# top-most carry
   2276 	mov	%r14,-1*8($tptr)
   2277 	jmp	.Lmulx4x_outer
   2278 
   2279 .align	32
   2280 .Lmulx4x_outer:
   2281 	mov	$zero,($tptr)		# save top-most carry
   2282 	lea	4*8($tptr,$num),$tptr	# rewind $tptr
   2283 	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
   2284 	xor	$zero,$zero		# cf=0, of=0
   2285 	mov	%rdx,$bi
   2286 	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
   2287 	adox	-4*8($tptr),$mi		# +t[0]
   2288 	adcx	%r14,%r11
   2289 	mulx	2*8($aptr),%r15,%r13	# ...
   2290 	adox	-3*8($tptr),%r11
   2291 	adcx	%r15,%r12
   2292 	mulx	3*8($aptr),%rdx,%r14
   2293 	adox	-2*8($tptr),%r12
   2294 	adcx	%rdx,%r13
   2295 	lea	($nptr,$num,2),$nptr	# rewind $nptr
   2296 	lea	4*8($aptr),$aptr
   2297 	adox	-1*8($tptr),%r13
   2298 	adcx	$zero,%r14
   2299 	adox	$zero,%r14
   2300 
   2301 	.byte	0x67
   2302 	mov	$mi,%r15
   2303 	imulq	32+8(%rsp),$mi		# "t[0]"*n0
   2304 
   2305 	movq	`0*$STRIDE/4-96`($bptr),%xmm0
   2306 	.byte	0x67,0x67
   2307 	mov	$mi,%rdx
   2308 	movq	`1*$STRIDE/4-96`($bptr),%xmm1
   2309 	.byte	0x67
   2310 	pand	%xmm4,%xmm0
   2311 	movq	`2*$STRIDE/4-96`($bptr),%xmm2
   2312 	.byte	0x67
   2313 	pand	%xmm5,%xmm1
   2314 	movq	`3*$STRIDE/4-96`($bptr),%xmm3
   2315 	add	\$$STRIDE,$bptr		# next &b[i]
   2316 	.byte	0x67
   2317 	pand	%xmm6,%xmm2
   2318 	por	%xmm1,%xmm0
   2319 	pand	%xmm7,%xmm3
   2320 	xor	$zero,$zero		# cf=0, of=0
   2321 	mov	$bptr,8+8(%rsp)		# off-load &b[i]
   2322 
   2323 	mulx	0*16($nptr),%rax,%r10
   2324 	adcx	%rax,%r15		# discarded
   2325 	adox	%r11,%r10
   2326 	mulx	1*16($nptr),%rax,%r11
   2327 	adcx	%rax,%r10
   2328 	adox	%r12,%r11
   2329 	mulx	2*16($nptr),%rax,%r12
   2330 	adcx	%rax,%r11
   2331 	adox	%r13,%r12
   2332 	mulx	3*16($nptr),%rax,%r15
   2333 	 mov	$bi,%rdx
   2334 	 por	%xmm2,%xmm0
   2335 	mov	24+8(%rsp),$bptr	# counter value
   2336 	mov	%r10,-8*4($tptr)
   2337 	 por	%xmm3,%xmm0
   2338 	adcx	%rax,%r12
   2339 	mov	%r11,-8*3($tptr)
   2340 	adox	$zero,%r15		# of=0
   2341 	mov	%r12,-8*2($tptr)
   2342 	lea	4*16($nptr),$nptr
   2343 	jmp	.Lmulx4x_inner
   2344 
   2345 .align	32
   2346 .Lmulx4x_inner:
   2347 	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
   2348 	adcx	$zero,%r15		# cf=0, modulo-scheduled
   2349 	adox	%r14,%r10
   2350 	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
   2351 	adcx	0*8($tptr),%r10
   2352 	adox	%rax,%r11
   2353 	mulx	2*8($aptr),%r12,%rax	# ...
   2354 	adcx	1*8($tptr),%r11
   2355 	adox	%r14,%r12
   2356 	mulx	3*8($aptr),%r13,%r14
   2357 	 mov	$mi,%rdx
   2358 	adcx	2*8($tptr),%r12
   2359 	adox	%rax,%r13
   2360 	adcx	3*8($tptr),%r13
   2361 	adox	$zero,%r14		# of=0
   2362 	lea	4*8($aptr),$aptr
   2363 	lea	4*8($tptr),$tptr
   2364 	adcx	$zero,%r14		# cf=0
   2365 
   2366 	adox	%r15,%r10
   2367 	mulx	0*16($nptr),%rax,%r15
   2368 	adcx	%rax,%r10
   2369 	adox	%r15,%r11
   2370 	mulx	1*16($nptr),%rax,%r15
   2371 	adcx	%rax,%r11
   2372 	adox	%r15,%r12
   2373 	mulx	2*16($nptr),%rax,%r15
   2374 	mov	%r10,-5*8($tptr)
   2375 	adcx	%rax,%r12
   2376 	adox	%r15,%r13
   2377 	mov	%r11,-4*8($tptr)
   2378 	mulx	3*16($nptr),%rax,%r15
   2379 	 mov	$bi,%rdx
   2380 	lea	4*16($nptr),$nptr
   2381 	mov	%r12,-3*8($tptr)
   2382 	adcx	%rax,%r13
   2383 	adox	$zero,%r15
   2384 	mov	%r13,-2*8($tptr)
   2385 
   2386 	dec	$bptr			# of=0, pass cf
   2387 	jnz	.Lmulx4x_inner
   2388 
   2389 	mov	0+8(%rsp),$num		# load -num
   2390 	movq	%xmm0,%rdx		# bp[i+1]
   2391 	adc	$zero,%r15		# modulo-scheduled
   2392 	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
   2393 	mov	8+8(%rsp),$bptr		# re-load &b[i]
   2394 	mov	16+8(%rsp),%r10
   2395 	adc	%r15,%r14
   2396 	lea	($aptr,$num),$aptr	# rewind $aptr
   2397 	adc	$zero,$zero		# top-most carry
   2398 	mov	%r14,-1*8($tptr)
   2399 
   2400 	cmp	%r10,$bptr
   2401 	jb	.Lmulx4x_outer
   2402 
   2403 	mov	-16($nptr),%r10
   2404 	xor	%r15,%r15
   2405 	sub	%r14,%r10		# compare top-most words
   2406 	adc	%r15,%r15
   2407 	or	%r15,$zero
   2408 	xor	\$1,$zero
   2409 	lea	($tptr,$num),%rdi	# rewind $tptr
   2410 	lea	($nptr,$num,2),$nptr	# rewind $nptr
   2411 	.byte	0x67,0x67
   2412 	sar	\$3+2,$num		# cf=0
   2413 	lea	($nptr,$zero,8),%rbp
   2414 	mov	56+8(%rsp),%rdx		# restore rp
   2415 	mov	$num,%rcx
   2416 	jmp	.Lsqrx4x_sub		# common post-condition
   2417 .size	mulx4x_internal,.-mulx4x_internal
   2418 ___
   2419 }{
   2421 ######################################################################
   2422 # void bn_power5(
   2423 my $rptr="%rdi";	# BN_ULONG *rptr,
   2424 my $aptr="%rsi";	# const BN_ULONG *aptr,
   2425 my $bptr="%rdx";	# const void *table,
   2426 my $nptr="%rcx";	# const BN_ULONG *nptr,
   2427 my $n0  ="%r8";		# const BN_ULONG *n0);
   2428 my $num ="%r9";		# int num, has to be divisible by 8
   2429 			# int pwr);
   2430 
   2431 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
   2432 my @A0=("%r10","%r11");
   2433 my @A1=("%r12","%r13");
   2434 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
   2435 
   2436 $code.=<<___;
   2437 .type	bn_powerx5,\@function,6
   2438 .align	32
   2439 bn_powerx5:
   2440 .Lpowerx5_enter:
   2441 	.byte	0x67
   2442 	mov	%rsp,%rax
   2443 	push	%rbx
   2444 	push	%rbp
   2445 	push	%r12
   2446 	push	%r13
   2447 	push	%r14
   2448 	push	%r15
   2449 ___
   2450 $code.=<<___ if ($win64);
   2451 	lea	-0x28(%rsp),%rsp
   2452 	movaps	%xmm6,(%rsp)
   2453 	movaps	%xmm7,0x10(%rsp)
   2454 ___
   2455 $code.=<<___;
   2456 	.byte	0x67
   2457 	mov	${num}d,%r10d
   2458 	shl	\$3,${num}d		# convert $num to bytes
   2459 	shl	\$3+2,%r10d		# 4*$num
   2460 	neg	$num
   2461 	mov	($n0),$n0		# *n0
   2462 
   2463 	##############################################################
   2464 	# ensure that stack frame doesn't alias with $aptr+4*$num
   2465 	# modulo 4096, which covers ret[num], am[num] and n[2*num]
   2466 	# (see bn_exp.c). this is done to allow memory disambiguation
   2467 	# logic do its magic.
   2468 	#
   2469 	lea	-64(%rsp,$num,2),%r11
   2470 	sub	$aptr,%r11
   2471 	and	\$4095,%r11
   2472 	cmp	%r11,%r10
   2473 	jb	.Lpwrx_sp_alt
   2474 	sub	%r11,%rsp		# align with $aptr
   2475 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
   2476 	jmp	.Lpwrx_sp_done
   2477 
   2478 .align	32
   2479 .Lpwrx_sp_alt:
   2480 	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
   2481 	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
   2482 	sub	%r10,%r11
   2483 	mov	\$0,%r10
   2484 	cmovc	%r10,%r11
   2485 	sub	%r11,%rsp
   2486 .Lpwrx_sp_done:
   2487 	and	\$-64,%rsp
   2488 	mov	$num,%r10	
   2489 	neg	$num
   2490 
   2491 	##############################################################
   2492 	# Stack layout
   2493 	#
   2494 	# +0	saved $num, used in reduction section
   2495 	# +8	&t[2*$num], used in reduction section
   2496 	# +16	intermediate carry bit
   2497 	# +24	top-most carry bit, used in reduction section
   2498 	# +32	saved *n0
   2499 	# +40	saved %rsp
   2500 	# +48	t[2*$num]
   2501 	#
   2502 	pxor	%xmm0,%xmm0
   2503 	movq	$rptr,%xmm1		# save $rptr
   2504 	movq	$nptr,%xmm2		# save $nptr
   2505 	movq	%r10, %xmm3		# -$num
   2506 	movq	$bptr,%xmm4
   2507 	mov	$n0,  32(%rsp)
   2508 	mov	%rax, 40(%rsp)		# save original %rsp
   2509 .Lpowerx5_body:
   2510 
   2511 	call	__bn_sqrx8x_internal
   2512 	call	__bn_sqrx8x_internal
   2513 	call	__bn_sqrx8x_internal
   2514 	call	__bn_sqrx8x_internal
   2515 	call	__bn_sqrx8x_internal
   2516 
   2517 	mov	%r10,$num		# -num
   2518 	mov	$aptr,$rptr
   2519 	movq	%xmm2,$nptr
   2520 	movq	%xmm4,$bptr
   2521 	mov	40(%rsp),%rax
   2522 
   2523 	call	mulx4x_internal
   2524 
   2525 	mov	40(%rsp),%rsi		# restore %rsp
   2526 	mov	\$1,%rax
   2527 ___
   2528 $code.=<<___ if ($win64);
   2529 	movaps	-88(%rsi),%xmm6
   2530 	movaps	-72(%rsi),%xmm7
   2531 ___
   2532 $code.=<<___;
   2533 	mov	-48(%rsi),%r15
   2534 	mov	-40(%rsi),%r14
   2535 	mov	-32(%rsi),%r13
   2536 	mov	-24(%rsi),%r12
   2537 	mov	-16(%rsi),%rbp
   2538 	mov	-8(%rsi),%rbx
   2539 	lea	(%rsi),%rsp
   2540 .Lpowerx5_epilogue:
   2541 	ret
   2542 .size	bn_powerx5,.-bn_powerx5
   2543 
   2544 .globl	bn_sqrx8x_internal
   2545 .hidden	bn_sqrx8x_internal
   2546 .type	bn_sqrx8x_internal,\@abi-omnipotent
   2547 .align	32
   2548 bn_sqrx8x_internal:
   2549 __bn_sqrx8x_internal:
   2550 	##################################################################
   2551 	# Squaring part:
   2552 	#
   2553 	# a) multiply-n-add everything but a[i]*a[i];
   2554 	# b) shift result of a) by 1 to the left and accumulate
   2555 	#    a[i]*a[i] products;
   2556 	#
   2557 	##################################################################
   2558 	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
   2559 	#                                                     a[1]a[0]
   2560 	#                                                 a[2]a[0]
   2561 	#                                             a[3]a[0]
   2562 	#                                             a[2]a[1]
   2563 	#                                         a[3]a[1]
   2564 	#                                     a[3]a[2]
   2565 	#
   2566 	#                                         a[4]a[0]
   2567 	#                                     a[5]a[0]
   2568 	#                                 a[6]a[0]
   2569 	#                             a[7]a[0]
   2570 	#                                     a[4]a[1]
   2571 	#                                 a[5]a[1]
   2572 	#                             a[6]a[1]
   2573 	#                         a[7]a[1]
   2574 	#                                 a[4]a[2]
   2575 	#                             a[5]a[2]
   2576 	#                         a[6]a[2]
   2577 	#                     a[7]a[2]
   2578 	#                             a[4]a[3]
   2579 	#                         a[5]a[3]
   2580 	#                     a[6]a[3]
   2581 	#                 a[7]a[3]
   2582 	#
   2583 	#                     a[5]a[4]
   2584 	#                 a[6]a[4]
   2585 	#             a[7]a[4]
   2586 	#             a[6]a[5]
   2587 	#         a[7]a[5]
   2588 	#     a[7]a[6]
   2589 	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
   2590 ___
   2591 {
   2592 my ($zero,$carry)=("%rbp","%rcx");
   2593 my $aaptr=$zero;
   2594 $code.=<<___;
   2595 	lea	48+8(%rsp),$tptr
   2596 	lea	($aptr,$num),$aaptr
   2597 	mov	$num,0+8(%rsp)			# save $num
   2598 	mov	$aaptr,8+8(%rsp)		# save end of $aptr
   2599 	jmp	.Lsqr8x_zero_start
   2600 
   2601 .align	32
   2602 .byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
   2603 .Lsqrx8x_zero:
   2604 	.byte	0x3e
   2605 	movdqa	%xmm0,0*8($tptr)
   2606 	movdqa	%xmm0,2*8($tptr)
   2607 	movdqa	%xmm0,4*8($tptr)
   2608 	movdqa	%xmm0,6*8($tptr)
   2609 .Lsqr8x_zero_start:			# aligned at 32
   2610 	movdqa	%xmm0,8*8($tptr)
   2611 	movdqa	%xmm0,10*8($tptr)
   2612 	movdqa	%xmm0,12*8($tptr)
   2613 	movdqa	%xmm0,14*8($tptr)
   2614 	lea	16*8($tptr),$tptr
   2615 	sub	\$64,$num
   2616 	jnz	.Lsqrx8x_zero
   2617 
   2618 	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
   2619 	#xor	%r9,%r9			# t[1], ex-$num, zero already
   2620 	xor	%r10,%r10
   2621 	xor	%r11,%r11
   2622 	xor	%r12,%r12
   2623 	xor	%r13,%r13
   2624 	xor	%r14,%r14
   2625 	xor	%r15,%r15
   2626 	lea	48+8(%rsp),$tptr
   2627 	xor	$zero,$zero		# cf=0, cf=0
   2628 	jmp	.Lsqrx8x_outer_loop
   2629 
   2630 .align	32
   2631 .Lsqrx8x_outer_loop:
   2632 	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
   2633 	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
   2634 	adox	%rax,%r10
   2635 	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
   2636 	adcx	%r10,%r9
   2637 	adox	%rax,%r11
   2638 	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
   2639 	adcx	%r11,%r10
   2640 	adox	%rax,%r12
   2641 	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
   2642 	adcx	%r12,%r11
   2643 	adox	%rax,%r13
   2644 	mulx	5*8($aptr),%r12,%rax
   2645 	adcx	%r13,%r12
   2646 	adox	%rax,%r14
   2647 	mulx	6*8($aptr),%r13,%rax
   2648 	adcx	%r14,%r13
   2649 	adox	%r15,%rax
   2650 	mulx	7*8($aptr),%r14,%r15
   2651 	 mov	1*8($aptr),%rdx		# a[1]
   2652 	adcx	%rax,%r14
   2653 	adox	$zero,%r15
   2654 	adc	8*8($tptr),%r15
   2655 	mov	%r8,1*8($tptr)		# t[1]
   2656 	mov	%r9,2*8($tptr)		# t[2]
   2657 	sbb	$carry,$carry		# mov %cf,$carry
   2658 	xor	$zero,$zero		# cf=0, of=0
   2659 
   2660 
   2661 	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
   2662 	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
   2663 	adcx	%r10,%r8
   2664 	adox	%rbx,%r9
   2665 	mulx	4*8($aptr),%r10,%rbx	# ...
   2666 	adcx	%r11,%r9
   2667 	adox	%rax,%r10
   2668 	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx	5*8($aptr),%r11,%rax
   2669 	adcx	%r12,%r10
   2670 	adox	%rbx,%r11
   2671 	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r12,%rbx
   2672 	adcx	%r13,%r11
   2673 	adox	%r14,%r12
   2674 	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r13,%r14
   2675 	 mov	2*8($aptr),%rdx		# a[2]
   2676 	adcx	%rax,%r12
   2677 	adox	%rbx,%r13
   2678 	adcx	%r15,%r13
   2679 	adox	$zero,%r14		# of=0
   2680 	adcx	$zero,%r14		# cf=0
   2681 
   2682 	mov	%r8,3*8($tptr)		# t[3]
   2683 	mov	%r9,4*8($tptr)		# t[4]
   2684 
   2685 	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
   2686 	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
   2687 	adcx	%r10,%r8
   2688 	adox	%rbx,%r9
   2689 	mulx	5*8($aptr),%r10,%rbx	# ...
   2690 	adcx	%r11,%r9
   2691 	adox	%rax,%r10
   2692 	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r11,%rax
   2693 	adcx	%r12,%r10
   2694 	adox	%r13,%r11
   2695 	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r12,%r13
   2696 	.byte	0x3e
   2697 	 mov	3*8($aptr),%rdx		# a[3]
   2698 	adcx	%rbx,%r11
   2699 	adox	%rax,%r12
   2700 	adcx	%r14,%r12
   2701 	mov	%r8,5*8($tptr)		# t[5]
   2702 	mov	%r9,6*8($tptr)		# t[6]
   2703 	 mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
   2704 	adox	$zero,%r13		# of=0
   2705 	adcx	$zero,%r13		# cf=0
   2706 
   2707 	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
   2708 	adcx	%r10,%r8
   2709 	adox	%rax,%r9
   2710 	mulx	6*8($aptr),%r10,%rax	# ...
   2711 	adcx	%r11,%r9
   2712 	adox	%r12,%r10
   2713 	mulx	7*8($aptr),%r11,%r12
   2714 	 mov	4*8($aptr),%rdx		# a[4]
   2715 	 mov	5*8($aptr),%r14		# a[5]
   2716 	adcx	%rbx,%r10
   2717 	adox	%rax,%r11
   2718 	 mov	6*8($aptr),%r15		# a[6]
   2719 	adcx	%r13,%r11
   2720 	adox	$zero,%r12		# of=0
   2721 	adcx	$zero,%r12		# cf=0
   2722 
   2723 	mov	%r8,7*8($tptr)		# t[7]
   2724 	mov	%r9,8*8($tptr)		# t[8]
   2725 
   2726 	mulx	%r14,%r9,%rax		# a[5]*a[4]
   2727 	 mov	7*8($aptr),%r8		# a[7]
   2728 	adcx	%r10,%r9
   2729 	mulx	%r15,%r10,%rbx		# a[6]*a[4]
   2730 	adox	%rax,%r10
   2731 	adcx	%r11,%r10
   2732 	mulx	%r8,%r11,%rax		# a[7]*a[4]
   2733 	 mov	%r14,%rdx		# a[5]
   2734 	adox	%rbx,%r11
   2735 	adcx	%r12,%r11
   2736 	#adox	$zero,%rax		# of=0
   2737 	adcx	$zero,%rax		# cf=0
   2738 
   2739 	mulx	%r15,%r14,%rbx		# a[6]*a[5]
   2740 	mulx	%r8,%r12,%r13		# a[7]*a[5]
   2741 	 mov	%r15,%rdx		# a[6]
   2742 	 lea	8*8($aptr),$aptr
   2743 	adcx	%r14,%r11
   2744 	adox	%rbx,%r12
   2745 	adcx	%rax,%r12
   2746 	adox	$zero,%r13
   2747 
   2748 	.byte	0x67,0x67
   2749 	mulx	%r8,%r8,%r14		# a[7]*a[6]
   2750 	adcx	%r8,%r13
   2751 	adcx	$zero,%r14
   2752 
   2753 	cmp	8+8(%rsp),$aptr
   2754 	je	.Lsqrx8x_outer_break
   2755 
   2756 	neg	$carry			# mov $carry,%cf
   2757 	mov	\$-8,%rcx
   2758 	mov	$zero,%r15
   2759 	mov	8*8($tptr),%r8
   2760 	adcx	9*8($tptr),%r9		# +=t[9]
   2761 	adcx	10*8($tptr),%r10	# ...
   2762 	adcx	11*8($tptr),%r11
   2763 	adc	12*8($tptr),%r12
   2764 	adc	13*8($tptr),%r13
   2765 	adc	14*8($tptr),%r14
   2766 	adc	15*8($tptr),%r15
   2767 	lea	($aptr),$aaptr
   2768 	lea	2*64($tptr),$tptr
   2769 	sbb	%rax,%rax		# mov %cf,$carry
   2770 
   2771 	mov	-64($aptr),%rdx		# a[0]
   2772 	mov	%rax,16+8(%rsp)		# offload $carry
   2773 	mov	$tptr,24+8(%rsp)
   2774 
   2775 	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
   2776 	xor	%eax,%eax		# cf=0, of=0
   2777 	jmp	.Lsqrx8x_loop
   2778 
   2779 .align	32
   2780 .Lsqrx8x_loop:
   2781 	mov	%r8,%rbx
   2782 	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
   2783 	adcx	%rax,%rbx		# +=t[8]
   2784 	adox	%r9,%r8
   2785 
   2786 	mulx	1*8($aaptr),%rax,%r9	# ...
   2787 	adcx	%rax,%r8
   2788 	adox	%r10,%r9
   2789 
   2790 	mulx	2*8($aaptr),%rax,%r10
   2791 	adcx	%rax,%r9
   2792 	adox	%r11,%r10
   2793 
   2794 	mulx	3*8($aaptr),%rax,%r11
   2795 	adcx	%rax,%r10
   2796 	adox	%r12,%r11
   2797 
   2798 	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	4*8($aaptr),%rax,%r12
   2799 	adcx	%rax,%r11
   2800 	adox	%r13,%r12
   2801 
   2802 	mulx	5*8($aaptr),%rax,%r13
   2803 	adcx	%rax,%r12
   2804 	adox	%r14,%r13
   2805 
   2806 	mulx	6*8($aaptr),%rax,%r14
   2807 	 mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
   2808 	 mov	\$0,%ebx
   2809 	adcx	%rax,%r13
   2810 	adox	%r15,%r14
   2811 
   2812 	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx	7*8($aaptr),%rax,%r15
   2813 	 mov	8($aptr,%rcx,8),%rdx	# a[i]
   2814 	adcx	%rax,%r14
   2815 	adox	%rbx,%r15		# %rbx is 0, of=0
   2816 	adcx	%rbx,%r15		# cf=0
   2817 
   2818 	.byte	0x67
   2819 	inc	%rcx			# of=0
   2820 	jnz	.Lsqrx8x_loop
   2821 
   2822 	lea	8*8($aaptr),$aaptr
   2823 	mov	\$-8,%rcx
   2824 	cmp	8+8(%rsp),$aaptr	# done?
   2825 	je	.Lsqrx8x_break
   2826 
   2827 	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
   2828 	.byte	0x66
   2829 	mov	-64($aptr),%rdx
   2830 	adcx	0*8($tptr),%r8
   2831 	adcx	1*8($tptr),%r9
   2832 	adc	2*8($tptr),%r10
   2833 	adc	3*8($tptr),%r11
   2834 	adc	4*8($tptr),%r12
   2835 	adc	5*8($tptr),%r13
   2836 	adc	6*8($tptr),%r14
   2837 	adc	7*8($tptr),%r15
   2838 	lea	8*8($tptr),$tptr
   2839 	.byte	0x67
   2840 	sbb	%rax,%rax		# mov %cf,%rax
   2841 	xor	%ebx,%ebx		# cf=0, of=0
   2842 	mov	%rax,16+8(%rsp)		# offload carry
   2843 	jmp	.Lsqrx8x_loop
   2844 
   2845 .align	32
   2846 .Lsqrx8x_break:
   2847 	sub	16+8(%rsp),%r8		# consume last carry
   2848 	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
   2849 	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
   2850 	xor	%ebp,%ebp		# xor	$zero,$zero
   2851 	mov	%r8,0*8($tptr)
   2852 	cmp	$carry,$tptr		# cf=0, of=0
   2853 	je	.Lsqrx8x_outer_loop
   2854 
   2855 	mov	%r9,1*8($tptr)
   2856 	 mov	1*8($carry),%r9
   2857 	mov	%r10,2*8($tptr)
   2858 	 mov	2*8($carry),%r10
   2859 	mov	%r11,3*8($tptr)
   2860 	 mov	3*8($carry),%r11
   2861 	mov	%r12,4*8($tptr)
   2862 	 mov	4*8($carry),%r12
   2863 	mov	%r13,5*8($tptr)
   2864 	 mov	5*8($carry),%r13
   2865 	mov	%r14,6*8($tptr)
   2866 	 mov	6*8($carry),%r14
   2867 	mov	%r15,7*8($tptr)
   2868 	 mov	7*8($carry),%r15
   2869 	mov	$carry,$tptr
   2870 	jmp	.Lsqrx8x_outer_loop
   2871 
   2872 .align	32
   2873 .Lsqrx8x_outer_break:
   2874 	mov	%r9,9*8($tptr)		# t[9]
   2875 	 movq	%xmm3,%rcx		# -$num
   2876 	mov	%r10,10*8($tptr)	# ...
   2877 	mov	%r11,11*8($tptr)
   2878 	mov	%r12,12*8($tptr)
   2879 	mov	%r13,13*8($tptr)
   2880 	mov	%r14,14*8($tptr)
   2881 ___
   2882 }{
   2884 my $i="%rcx";
   2885 $code.=<<___;
   2886 	lea	48+8(%rsp),$tptr
   2887 	mov	($aptr,$i),%rdx		# a[0]
   2888 
   2889 	mov	8($tptr),$A0[1]		# t[1]
   2890 	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
   2891 	mov	0+8(%rsp),$num		# restore $num
   2892 	adox	$A0[1],$A0[1]
   2893 	 mov	16($tptr),$A1[0]	# t[2]	# prefetch
   2894 	 mov	24($tptr),$A1[1]	# t[3]	# prefetch
   2895 	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned
   2896 
   2897 .align	32
   2898 .Lsqrx4x_shift_n_add:
   2899 	mulx	%rdx,%rax,%rbx
   2900 	 adox	$A1[0],$A1[0]
   2901 	adcx	$A0[0],%rax
   2902 	 .byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
   2903 	 .byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
   2904 	 adox	$A1[1],$A1[1]
   2905 	adcx	$A0[1],%rbx
   2906 	 mov	40($tptr),$A0[1]		# t[2*i+4+1]	# prefetch
   2907 	mov	%rax,0($tptr)
   2908 	mov	%rbx,8($tptr)
   2909 
   2910 	mulx	%rdx,%rax,%rbx
   2911 	 adox	$A0[0],$A0[0]
   2912 	adcx	$A1[0],%rax
   2913 	 mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
   2914 	 mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
   2915 	 adox	$A0[1],$A0[1]
   2916 	adcx	$A1[1],%rbx
   2917 	 mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
   2918 	mov	%rax,16($tptr)
   2919 	mov	%rbx,24($tptr)
   2920 
   2921 	mulx	%rdx,%rax,%rbx
   2922 	 adox	$A1[0],$A1[0]
   2923 	adcx	$A0[0],%rax
   2924 	 mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
   2925 	 lea	32($i),$i
   2926 	 mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
   2927 	 adox	$A1[1],$A1[1]
   2928 	adcx	$A0[1],%rbx
   2929 	 mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
   2930 	mov	%rax,32($tptr)
   2931 	mov	%rbx,40($tptr)
   2932 
   2933 	mulx	%rdx,%rax,%rbx
   2934 	 adox	$A0[0],$A0[0]
   2935 	adcx	$A1[0],%rax
   2936 	jrcxz	.Lsqrx4x_shift_n_add_break
   2937 	 .byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
   2938 	 adox	$A0[1],$A0[1]
   2939 	adcx	$A1[1],%rbx
   2940 	 mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
   2941 	 mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
   2942 	mov	%rax,48($tptr)
   2943 	mov	%rbx,56($tptr)
   2944 	lea	64($tptr),$tptr
   2945 	nop
   2946 	jmp	.Lsqrx4x_shift_n_add
   2947 
   2948 .align	32
   2949 .Lsqrx4x_shift_n_add_break:
   2950 	adcx	$A1[1],%rbx
   2951 	mov	%rax,48($tptr)
   2952 	mov	%rbx,56($tptr)
   2953 	lea	64($tptr),$tptr		# end of t[] buffer
   2954 ___
   2955 }
   2957 ######################################################################
   2958 # Montgomery reduction part, "word-by-word" algorithm.
   2959 #
   2960 # This new path is inspired by multiple submissions from Intel, by
   2961 # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
   2962 # Vinodh Gopal...
   2963 {
   2964 my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
   2965 
   2966 $code.=<<___;
   2967 	movq	%xmm2,$nptr
   2968 sqrx8x_reduction:
   2969 	xor	%eax,%eax		# initial top-most carry bit
   2970 	mov	32+8(%rsp),%rbx		# n0
   2971 	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
   2972 	lea	-128($nptr,$num,2),%rcx	# end of n[]
   2973 	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
   2974 	mov	%rcx, 0+8(%rsp)		# save end of n[]
   2975 	mov	$tptr,8+8(%rsp)		# save end of t[]
   2976 
   2977 	lea	48+8(%rsp),$tptr		# initial t[] window
   2978 	jmp	.Lsqrx8x_reduction_loop
   2979 
   2980 .align	32
   2981 .Lsqrx8x_reduction_loop:
   2982 	mov	8*1($tptr),%r9
   2983 	mov	8*2($tptr),%r10
   2984 	mov	8*3($tptr),%r11
   2985 	mov	8*4($tptr),%r12
   2986 	mov	%rdx,%r8
   2987 	imulq	%rbx,%rdx		# n0*a[i]
   2988 	mov	8*5($tptr),%r13
   2989 	mov	8*6($tptr),%r14
   2990 	mov	8*7($tptr),%r15
   2991 	mov	%rax,24+8(%rsp)		# store top-most carry bit
   2992 
   2993 	lea	8*8($tptr),$tptr
   2994 	xor	$carry,$carry		# cf=0,of=0
   2995 	mov	\$-8,%rcx
   2996 	jmp	.Lsqrx8x_reduce
   2997 
   2998 .align	32
   2999 .Lsqrx8x_reduce:
   3000 	mov	%r8, %rbx
   3001 	mulx	16*0($nptr),%rax,%r8	# n[0]
   3002 	adcx	%rbx,%rax		# discarded
   3003 	adox	%r9,%r8
   3004 
   3005 	mulx	16*1($nptr),%rbx,%r9	# n[1]
   3006 	adcx	%rbx,%r8
   3007 	adox	%r10,%r9
   3008 
   3009 	mulx	16*2($nptr),%rbx,%r10
   3010 	adcx	%rbx,%r9
   3011 	adox	%r11,%r10
   3012 
   3013 	mulx	16*3($nptr),%rbx,%r11
   3014 	adcx	%rbx,%r10
   3015 	adox	%r12,%r11
   3016 
   3017 	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx	16*4($nptr),%rbx,%r12
   3018 	 mov	%rdx,%rax
   3019 	 mov	%r8,%rdx
   3020 	adcx	%rbx,%r11
   3021 	adox	%r13,%r12
   3022 
   3023 	 mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
   3024 	 mov	%rax,%rdx
   3025 	 mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]
   3026 
   3027 	mulx	16*5($nptr),%rax,%r13
   3028 	adcx	%rax,%r12
   3029 	adox	%r14,%r13
   3030 
   3031 	mulx	16*6($nptr),%rax,%r14
   3032 	adcx	%rax,%r13
   3033 	adox	%r15,%r14
   3034 
   3035 	mulx	16*7($nptr),%rax,%r15
   3036 	 mov	%rbx,%rdx
   3037 	adcx	%rax,%r14
   3038 	adox	$carry,%r15		# $carry is 0
   3039 	adcx	$carry,%r15		# cf=0
   3040 
   3041 	.byte	0x67,0x67,0x67
   3042 	inc	%rcx			# of=0
   3043 	jnz	.Lsqrx8x_reduce
   3044 
   3045 	mov	$carry,%rax		# xor	%rax,%rax
   3046 	cmp	0+8(%rsp),$nptr		# end of n[]?
   3047 	jae	.Lsqrx8x_no_tail
   3048 
   3049 	mov	48+8(%rsp),%rdx		# pull n0*a[0]
   3050 	add	8*0($tptr),%r8
   3051 	lea	16*8($nptr),$nptr
   3052 	mov	\$-8,%rcx
   3053 	adcx	8*1($tptr),%r9
   3054 	adcx	8*2($tptr),%r10
   3055 	adc	8*3($tptr),%r11
   3056 	adc	8*4($tptr),%r12
   3057 	adc	8*5($tptr),%r13
   3058 	adc	8*6($tptr),%r14
   3059 	adc	8*7($tptr),%r15
   3060 	lea	8*8($tptr),$tptr
   3061 	sbb	%rax,%rax		# top carry
   3062 
   3063 	xor	$carry,$carry		# of=0, cf=0
   3064 	mov	%rax,16+8(%rsp)
   3065 	jmp	.Lsqrx8x_tail
   3066 
   3067 .align	32
   3068 .Lsqrx8x_tail:
   3069 	mov	%r8,%rbx
   3070 	mulx	16*0($nptr),%rax,%r8
   3071 	adcx	%rax,%rbx
   3072 	adox	%r9,%r8
   3073 
   3074 	mulx	16*1($nptr),%rax,%r9
   3075 	adcx	%rax,%r8
   3076 	adox	%r10,%r9
   3077 
   3078 	mulx	16*2($nptr),%rax,%r10
   3079 	adcx	%rax,%r9
   3080 	adox	%r11,%r10
   3081 
   3082 	mulx	16*3($nptr),%rax,%r11
   3083 	adcx	%rax,%r10
   3084 	adox	%r12,%r11
   3085 
   3086 	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx	16*4($nptr),%rax,%r12
   3087 	adcx	%rax,%r11
   3088 	adox	%r13,%r12
   3089 
   3090 	mulx	16*5($nptr),%rax,%r13
   3091 	adcx	%rax,%r12
   3092 	adox	%r14,%r13
   3093 
   3094 	mulx	16*6($nptr),%rax,%r14
   3095 	adcx	%rax,%r13
   3096 	adox	%r15,%r14
   3097 
   3098 	mulx	16*7($nptr),%rax,%r15
   3099 	 mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
   3100 	adcx	%rax,%r14
   3101 	adox	$carry,%r15
   3102 	 mov	%rbx,($tptr,%rcx,8)	# save result
   3103 	 mov	%r8,%rbx
   3104 	adcx	$carry,%r15		# cf=0
   3105 
   3106 	inc	%rcx			# of=0
   3107 	jnz	.Lsqrx8x_tail
   3108 
   3109 	cmp	0+8(%rsp),$nptr		# end of n[]?
   3110 	jae	.Lsqrx8x_tail_done	# break out of loop
   3111 
   3112 	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
   3113 	 mov	48+8(%rsp),%rdx		# pull n0*a[0]
   3114 	 lea	16*8($nptr),$nptr
   3115 	adc	8*0($tptr),%r8
   3116 	adc	8*1($tptr),%r9
   3117 	adc	8*2($tptr),%r10
   3118 	adc	8*3($tptr),%r11
   3119 	adc	8*4($tptr),%r12
   3120 	adc	8*5($tptr),%r13
   3121 	adc	8*6($tptr),%r14
   3122 	adc	8*7($tptr),%r15
   3123 	lea	8*8($tptr),$tptr
   3124 	sbb	%rax,%rax
   3125 	sub	\$8,%rcx		# mov	\$-8,%rcx
   3126 
   3127 	xor	$carry,$carry		# of=0, cf=0
   3128 	mov	%rax,16+8(%rsp)
   3129 	jmp	.Lsqrx8x_tail
   3130 
   3131 .align	32
   3132 .Lsqrx8x_tail_done:
   3133 	add	24+8(%rsp),%r8		# can this overflow?
   3134 	mov	$carry,%rax		# xor	%rax,%rax
   3135 
   3136 	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
   3137 .Lsqrx8x_no_tail:			# %cf is 0 if jumped here
   3138 	adc	8*0($tptr),%r8
   3139 	 movq	%xmm3,%rcx
   3140 	adc	8*1($tptr),%r9
   3141 	 mov	16*7($nptr),$carry
   3142 	 movq	%xmm2,$nptr		# restore $nptr
   3143 	adc	8*2($tptr),%r10
   3144 	adc	8*3($tptr),%r11
   3145 	adc	8*4($tptr),%r12
   3146 	adc	8*5($tptr),%r13
   3147 	adc	8*6($tptr),%r14
   3148 	adc	8*7($tptr),%r15
   3149 	adc	%rax,%rax		# top-most carry
   3150 
   3151 	mov	32+8(%rsp),%rbx		# n0
   3152 	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"
   3153 
   3154 	mov	%r8,8*0($tptr)		# store top 512 bits
   3155 	 lea	8*8($tptr),%r8		# borrow %r8
   3156 	mov	%r9,8*1($tptr)
   3157 	mov	%r10,8*2($tptr)
   3158 	mov	%r11,8*3($tptr)
   3159 	mov	%r12,8*4($tptr)
   3160 	mov	%r13,8*5($tptr)
   3161 	mov	%r14,8*6($tptr)
   3162 	mov	%r15,8*7($tptr)
   3163 
   3164 	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
   3165 	cmp	8+8(%rsp),%r8		# end of t[]?
   3166 	jb	.Lsqrx8x_reduction_loop
   3167 ___
   3168 }
   3170 ##############################################################
   3171 # Post-condition, 4x unrolled
   3172 #
   3173 {
   3174 my ($rptr,$nptr)=("%rdx","%rbp");
   3175 my @ri=map("%r$_",(10..13));
   3176 my @ni=map("%r$_",(14..15));
   3177 $code.=<<___;
   3178 	xor	%rbx,%rbx
   3179 	sub	%r15,%rsi		# compare top-most words
   3180 	adc	%rbx,%rbx
   3181 	mov	%rcx,%r10		# -$num
   3182 	.byte	0x67
   3183 	or	%rbx,%rax
   3184 	.byte	0x67
   3185 	mov	%rcx,%r9		# -$num
   3186 	xor	\$1,%rax
   3187 	sar	\$3+2,%rcx		# cf=0
   3188 	#lea	48+8(%rsp,%r9),$tptr
   3189 	lea	($nptr,%rax,8),$nptr
   3190 	movq	%xmm1,$rptr		# restore $rptr
   3191 	movq	%xmm1,$aptr		# prepare for back-to-back call
   3192 	jmp	.Lsqrx4x_sub
   3193 
   3194 .align	32
   3195 .Lsqrx4x_sub:
   3196 	.byte	0x66
   3197 	mov	8*0($tptr),%r12
   3198 	mov	8*1($tptr),%r13
   3199 	sbb	16*0($nptr),%r12
   3200 	mov	8*2($tptr),%r14
   3201 	sbb	16*1($nptr),%r13
   3202 	mov	8*3($tptr),%r15
   3203 	lea	8*4($tptr),$tptr
   3204 	sbb	16*2($nptr),%r14
   3205 	mov	%r12,8*0($rptr)
   3206 	sbb	16*3($nptr),%r15
   3207 	lea	16*4($nptr),$nptr
   3208 	mov	%r13,8*1($rptr)
   3209 	mov	%r14,8*2($rptr)
   3210 	mov	%r15,8*3($rptr)
   3211 	lea	8*4($rptr),$rptr
   3212 
   3213 	inc	%rcx
   3214 	jnz	.Lsqrx4x_sub
   3215 ___
   3216 }
   3217 $code.=<<___;
   3218 	neg	%r9			# restore $num
   3219 
   3220 	ret
   3221 .size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
   3222 ___
   3223 }}}
   3224 {
   3225 my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
   3226 				("%rdi","%esi","%rdx","%ecx");  # Unix order
   3227 my $out=$inp;
   3228 my $STRIDE=2**5*8;
   3229 my $N=$STRIDE/4;
   3230 
   3231 $code.=<<___;
   3232 .globl	bn_scatter5
   3233 .type	bn_scatter5,\@abi-omnipotent
   3234 .align	16
   3235 bn_scatter5:
   3236 	cmp	\$0, $num
   3237 	jz	.Lscatter_epilogue
   3238 	lea	($tbl,$idx,8),$tbl
   3239 .Lscatter:
   3240 	mov	($inp),%rax
   3241 	lea	8($inp),$inp
   3242 	mov	%rax,($tbl)
   3243 	lea	32*8($tbl),$tbl
   3244 	sub	\$1,$num
   3245 	jnz	.Lscatter
   3246 .Lscatter_epilogue:
   3247 	ret
   3248 .size	bn_scatter5,.-bn_scatter5
   3249 
   3250 .globl	bn_gather5
   3251 .type	bn_gather5,\@abi-omnipotent
   3252 .align	16
   3253 bn_gather5:
   3254 ___
   3255 $code.=<<___ if ($win64);
   3256 .LSEH_begin_bn_gather5:
   3257 	# I can't trust assembler to use specific encoding:-(
   3258 	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
   3259 	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
   3260 	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
   3261 ___
   3262 $code.=<<___;
   3263 	mov	$idx,%r11d
   3264 	shr	\$`log($N/8)/log(2)`,$idx
   3265 	and	\$`$N/8-1`,%r11
   3266 	not	$idx
   3267 	lea	.Lmagic_masks(%rip),%rax
   3268 	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
   3269 	lea	128($tbl,%r11,8),$tbl	# pointer within 1st cache line
   3270 	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
   3271 	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
   3272 	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
   3273 	movq	24(%rax,$idx,8),%xmm7
   3274 	jmp	.Lgather
   3275 .align	16
   3276 .Lgather:
   3277 	movq	`0*$STRIDE/4-128`($tbl),%xmm0
   3278 	movq	`1*$STRIDE/4-128`($tbl),%xmm1
   3279 	pand	%xmm4,%xmm0
   3280 	movq	`2*$STRIDE/4-128`($tbl),%xmm2
   3281 	pand	%xmm5,%xmm1
   3282 	movq	`3*$STRIDE/4-128`($tbl),%xmm3
   3283 	pand	%xmm6,%xmm2
   3284 	por	%xmm1,%xmm0
   3285 	pand	%xmm7,%xmm3
   3286 	.byte	0x67,0x67
   3287 	por	%xmm2,%xmm0
   3288 	lea	$STRIDE($tbl),$tbl
   3289 	por	%xmm3,%xmm0
   3290 
   3291 	movq	%xmm0,($out)		# m0=bp[0]
   3292 	lea	8($out),$out
   3293 	sub	\$1,$num
   3294 	jnz	.Lgather
   3295 ___
   3296 $code.=<<___ if ($win64);
   3297 	movaps	(%rsp),%xmm6
   3298 	movaps	0x10(%rsp),%xmm7
   3299 	lea	0x28(%rsp),%rsp
   3300 ___
   3301 $code.=<<___;
   3302 	ret
   3303 .LSEH_end_bn_gather5:
   3304 .size	bn_gather5,.-bn_gather5
   3305 ___
   3306 }
   3307 $code.=<<___;
   3308 .align	64
   3309 .Lmagic_masks:
   3310 	.long	0,0, 0,0, 0,0, -1,-1
   3311 	.long	0,0, 0,0, 0,0,  0,0
   3312 .asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   3313 ___
   3314 
   3315 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   3316 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   3317 if ($win64) {
   3318 $rec="%rcx";
   3319 $frame="%rdx";
   3320 $context="%r8";
   3321 $disp="%r9";
   3322 
   3323 $code.=<<___;
   3324 .extern	__imp_RtlVirtualUnwind
   3325 .type	mul_handler,\@abi-omnipotent
   3326 .align	16
   3327 mul_handler:
   3328 	push	%rsi
   3329 	push	%rdi
   3330 	push	%rbx
   3331 	push	%rbp
   3332 	push	%r12
   3333 	push	%r13
   3334 	push	%r14
   3335 	push	%r15
   3336 	pushfq
   3337 	sub	\$64,%rsp
   3338 
   3339 	mov	120($context),%rax	# pull context->Rax
   3340 	mov	248($context),%rbx	# pull context->Rip
   3341 
   3342 	mov	8($disp),%rsi		# disp->ImageBase
   3343 	mov	56($disp),%r11		# disp->HandlerData
   3344 
   3345 	mov	0(%r11),%r10d		# HandlerData[0]
   3346 	lea	(%rsi,%r10),%r10	# end of prologue label
   3347 	cmp	%r10,%rbx		# context->Rip<end of prologue label
   3348 	jb	.Lcommon_seh_tail
   3349 
   3350 	mov	152($context),%rax	# pull context->Rsp
   3351 
   3352 	mov	4(%r11),%r10d		# HandlerData[1]
   3353 	lea	(%rsi,%r10),%r10	# epilogue label
   3354 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   3355 	jae	.Lcommon_seh_tail
   3356 
   3357 	lea	.Lmul_epilogue(%rip),%r10
   3358 	cmp	%r10,%rbx
   3359 	jb	.Lbody_40
   3360 
   3361 	mov	192($context),%r10	# pull $num
   3362 	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
   3363 	jmp	.Lbody_proceed
   3364 
   3365 .Lbody_40:
   3366 	mov	40(%rax),%rax		# pull saved stack pointer
   3367 .Lbody_proceed:
   3368 
   3369 	movaps	-88(%rax),%xmm0
   3370 	movaps	-72(%rax),%xmm1
   3371 
   3372 	mov	-8(%rax),%rbx
   3373 	mov	-16(%rax),%rbp
   3374 	mov	-24(%rax),%r12
   3375 	mov	-32(%rax),%r13
   3376 	mov	-40(%rax),%r14
   3377 	mov	-48(%rax),%r15
   3378 	mov	%rbx,144($context)	# restore context->Rbx
   3379 	mov	%rbp,160($context)	# restore context->Rbp
   3380 	mov	%r12,216($context)	# restore context->R12
   3381 	mov	%r13,224($context)	# restore context->R13
   3382 	mov	%r14,232($context)	# restore context->R14
   3383 	mov	%r15,240($context)	# restore context->R15
   3384 	movups	%xmm0,512($context)	# restore context->Xmm6
   3385 	movups	%xmm1,528($context)	# restore context->Xmm7
   3386 
   3387 .Lcommon_seh_tail:
   3388 	mov	8(%rax),%rdi
   3389 	mov	16(%rax),%rsi
   3390 	mov	%rax,152($context)	# restore context->Rsp
   3391 	mov	%rsi,168($context)	# restore context->Rsi
   3392 	mov	%rdi,176($context)	# restore context->Rdi
   3393 
   3394 	mov	40($disp),%rdi		# disp->ContextRecord
   3395 	mov	$context,%rsi		# context
   3396 	mov	\$154,%ecx		# sizeof(CONTEXT)
   3397 	.long	0xa548f3fc		# cld; rep movsq
   3398 
   3399 	mov	$disp,%rsi
   3400 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   3401 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   3402 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   3403 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   3404 	mov	40(%rsi),%r10		# disp->ContextRecord
   3405 	lea	56(%rsi),%r11		# &disp->HandlerData
   3406 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   3407 	mov	%r10,32(%rsp)		# arg5
   3408 	mov	%r11,40(%rsp)		# arg6
   3409 	mov	%r12,48(%rsp)		# arg7
   3410 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   3411 	call	*__imp_RtlVirtualUnwind(%rip)
   3412 
   3413 	mov	\$1,%eax		# ExceptionContinueSearch
   3414 	add	\$64,%rsp
   3415 	popfq
   3416 	pop	%r15
   3417 	pop	%r14
   3418 	pop	%r13
   3419 	pop	%r12
   3420 	pop	%rbp
   3421 	pop	%rbx
   3422 	pop	%rdi
   3423 	pop	%rsi
   3424 	ret
   3425 .size	mul_handler,.-mul_handler
   3426 
   3427 .section	.pdata
   3428 .align	4
   3429 	.rva	.LSEH_begin_bn_mul_mont_gather5
   3430 	.rva	.LSEH_end_bn_mul_mont_gather5
   3431 	.rva	.LSEH_info_bn_mul_mont_gather5
   3432 
   3433 	.rva	.LSEH_begin_bn_mul4x_mont_gather5
   3434 	.rva	.LSEH_end_bn_mul4x_mont_gather5
   3435 	.rva	.LSEH_info_bn_mul4x_mont_gather5
   3436 
   3437 	.rva	.LSEH_begin_bn_power5
   3438 	.rva	.LSEH_end_bn_power5
   3439 	.rva	.LSEH_info_bn_power5
   3440 
   3441 	.rva	.LSEH_begin_bn_from_mont8x
   3442 	.rva	.LSEH_end_bn_from_mont8x
   3443 	.rva	.LSEH_info_bn_from_mont8x
   3444 ___
   3445 $code.=<<___ if ($addx);
   3446 	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
   3447 	.rva	.LSEH_end_bn_mulx4x_mont_gather5
   3448 	.rva	.LSEH_info_bn_mulx4x_mont_gather5
   3449 
   3450 	.rva	.LSEH_begin_bn_powerx5
   3451 	.rva	.LSEH_end_bn_powerx5
   3452 	.rva	.LSEH_info_bn_powerx5
   3453 ___
   3454 $code.=<<___;
   3455 	.rva	.LSEH_begin_bn_gather5
   3456 	.rva	.LSEH_end_bn_gather5
   3457 	.rva	.LSEH_info_bn_gather5
   3458 
   3459 .section	.xdata
   3460 .align	8
   3461 .LSEH_info_bn_mul_mont_gather5:
   3462 	.byte	9,0,0,0
   3463 	.rva	mul_handler
   3464 	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
   3465 .align	8
   3466 .LSEH_info_bn_mul4x_mont_gather5:
   3467 	.byte	9,0,0,0
   3468 	.rva	mul_handler
   3469 	.rva	.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
   3470 .align	8
   3471 .LSEH_info_bn_power5:
   3472 	.byte	9,0,0,0
   3473 	.rva	mul_handler
   3474 	.rva	.Lpower5_body,.Lpower5_epilogue		# HandlerData[]
   3475 .align	8
   3476 .LSEH_info_bn_from_mont8x:
   3477 	.byte	9,0,0,0
   3478 	.rva	mul_handler
   3479 	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
   3480 ___
   3481 $code.=<<___ if ($addx);
   3482 .align	8
   3483 .LSEH_info_bn_mulx4x_mont_gather5:
   3484 	.byte	9,0,0,0
   3485 	.rva	mul_handler
   3486 	.rva	.Lmulx4x_body,.Lmulx4x_epilogue		# HandlerData[]
   3487 .align	8
   3488 .LSEH_info_bn_powerx5:
   3489 	.byte	9,0,0,0
   3490 	.rva	mul_handler
   3491 	.rva	.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
   3492 ___
   3493 $code.=<<___;
   3494 .align	8
   3495 .LSEH_info_bn_gather5:
   3496         .byte   0x01,0x0d,0x05,0x00
   3497         .byte   0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
   3498         .byte   0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
   3499         .byte   0x04,0x42,0x00,0x00	#sub	rsp,0x28
   3500 .align	8
   3501 ___
   3502 }
   3503 
   3504 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
   3505 
   3506 print $code;
   3507 close STDOUT;
   3508