      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # August 2011.
     11 #
     12 # Companion to x86_64-mont.pl that optimizes cache-timing attack
     13 # countermeasures. The subroutines are produced by replacing bp[i]
     14 # references in their x86_64-mont.pl counterparts with cache-neutral
     15 # references to the powers table computed in BN_mod_exp_mont_consttime.
     16 # In addition, a subroutine that scatters elements of the powers table
     17 # is implemented, so that scattering/gathering can be tuned without
     18 # modifying bn_exp.c.
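#
# A C-style sketch of the powers-table layout this module relies on
# (illustrative only; names and types are hypothetical): word j of power k
# is stored at index j*2^5+k, so gathering any one power touches the same
# cache lines regardless of k.
#
#	/* scatter power k (num 64-bit words) into the interleaved table */
#	static void scatter(BN_ULONG *table, const BN_ULONG *power,
#	                    int num, int k)
#	{
#		for (int j = 0; j < num; j++)
#			table[j * 32 + k] = power[j];
#	}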
     19 
     20 # August 2013.
     21 #
     22 # Add MULX/AD*X code paths and additional interfaces to optimize for
     23 # branch prediction unit. For input lengths that are multiples of 8
     24 # the np argument is not just modulus value, but one interleaved
     25 # with 0. This is to optimize post-condition...
     26 
     27 $flavour = shift;
     28 $output  = shift;
     29 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     30 
     31 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     32 
     33 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     34 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     35 ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     36 die "can't locate x86_64-xlate.pl";
     37 
     38 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
     39 *STDOUT=*OUT;
     40 
     41 # In upstream, this is controlled by shelling out to the compiler to check
     42 # versions, but BoringSSL is intended to be used with pre-generated perlasm
     43 # output, so this isn't useful anyway.
     44 #
     45 # TODO(davidben): Set $addx to one once build problems are resolved.
     46 $addx = 0;
     47 
     48 # int bn_mul_mont_gather5(
     49 $rp="%rdi";	# BN_ULONG *rp,
     50 $ap="%rsi";	# const BN_ULONG *ap,
     51 $bp="%rdx";	# const BN_ULONG *bp,
     52 $np="%rcx";	# const BN_ULONG *np,
     53 $n0="%r8";	# const BN_ULONG *n0,
     54 $num="%r9";	# int num,
     55 		# int idx);	# 0 to 2^5-1, "index" in $bp holding
     56 				# pre-computed powers of a', interlaced
     57 				# in such a manner that b[0] is $bp[idx],
     58 				# b[1] is $bp[2^5+idx], etc.
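#
# A C-style sketch of what the scalar path below computes (illustrative
# only; b[] denotes the power gathered at position idx, R = 2^(64*num)):
#
#	/* rp[] = ap[]*b[]*R^-1 mod np[], one word of b per outer round */
#	for (i = 0; i < num; i++) {
#		BN_ULONG m0 = b[i];
#		BN_ULONG m1 = (tp[0] + ap[0]*m0) * n0;	/* mod 2^64 */
#		/* tp = (tp + ap*m0 + np*m1) / 2^64, word by word; the
#		 * lowest word is divisible by 2^64 by choice of m1 */
#	}
#	/* conditional subtraction of np[] (.Lsub/.Lcopy) gives rp[] */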
     59 $lo0="%r10";
     60 $hi0="%r11";
     61 $hi1="%r13";
     62 $i="%r14";
     63 $j="%r15";
     64 $m0="%rbx";
     65 $m1="%rbp";
     66 
     67 $code=<<___;
     68 .text
     69 
     70 .extern	OPENSSL_ia32cap_P
     71 
     72 .globl	bn_mul_mont_gather5
     73 .type	bn_mul_mont_gather5,\@function,6
     74 .align	64
     75 bn_mul_mont_gather5:
     76 .cfi_startproc
     77 	mov	${num}d,${num}d
     78 	mov	%rsp,%rax
     79 .cfi_def_cfa_register	%rax
     80 	test	\$7,${num}d
     81 	jnz	.Lmul_enter
     82 ___
     83 $code.=<<___ if ($addx);
     84 	leaq	OPENSSL_ia32cap_P(%rip),%r11
     85 	mov	8(%r11),%r11d
     86 ___
     87 $code.=<<___;
     88 	jmp	.Lmul4x_enter
     89 
     90 .align	16
     91 .Lmul_enter:
     92 	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
     93 	push	%rbx
     94 .cfi_push	%rbx
     95 	push	%rbp
     96 .cfi_push	%rbp
     97 	push	%r12
     98 .cfi_push	%r12
     99 	push	%r13
    100 .cfi_push	%r13
    101 	push	%r14
    102 .cfi_push	%r14
    103 	push	%r15
    104 .cfi_push	%r15
    105 
    106 	neg	$num
    107 	mov	%rsp,%r11
    108 	lea	-280(%rsp,$num,8),%r10	# future alloca(8*(num+2)+256+8)
    109 	neg	$num			# restore $num
    110 	and	\$-1024,%r10		# minimize TLB usage
    111 
    112 	# An OS-agnostic version of __chkstk.
    113 	#
    114 	# Some OSes (Windows) insist on the stack being "wired" to
    115 	# physical memory in a strictly sequential manner, i.e. if a stack
    116 	# allocation spans two pages, then a reference to the farthest one
    117 	# can be punished with a SEGV. But page walking does good even on
    118 	# other OSes, because it guarantees that a rogue thread hits the
    119 	# guard page before it can damage an innocent one...
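	#
	# A C-style sketch of the probe loop below (illustrative only):
	#
	#	/* touch one word in every 4K page between the old and the
	#	 * new stack pointer, top to bottom */
	#	while (sp > new_sp) {
	#		sp -= 4096;
	#		(void)*(volatile char *)sp;
	#	}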
    120 	sub	%r10,%r11
    121 	and	\$-4096,%r11
    122 	lea	(%r10,%r11),%rsp
    123 	mov	(%rsp),%r11
    124 	cmp	%r10,%rsp
    125 	ja	.Lmul_page_walk
    126 	jmp	.Lmul_page_walk_done
    127 
    128 .Lmul_page_walk:
    129 	lea	-4096(%rsp),%rsp
    130 	mov	(%rsp),%r11
    131 	cmp	%r10,%rsp
    132 	ja	.Lmul_page_walk
    133 .Lmul_page_walk_done:
    134 
    135 	lea	.Linc(%rip),%r10
    136 	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
    137 .cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
    138 .Lmul_body:
    139 
    140 	lea	128($bp),%r12		# reassign $bp (+size optimization)
    141 ___
    142 		$bp="%r12";
    143 		$STRIDE=2**5*8;		# 5 is "window size"
    144 		$N=$STRIDE/4;		# should match cache line size
    145 $code.=<<___;
    146 	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
    147 	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
    148 	lea	24-112(%rsp,$num,8),%r10	# place the mask after tp[num+3] (+ICache optimization)
    149 	and	\$-16,%r10
    150 
    151 	pshufd	\$0,%xmm5,%xmm5		# broadcast index
    152 	movdqa	%xmm1,%xmm4
    153 	movdqa	%xmm1,%xmm2
    154 ___
    155 ########################################################################
    156 # calculate mask by comparing 0..31 to index and save result to stack
    157 #
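#
# A C-style sketch of the mask-and-gather below (illustrative only):
#
#	/* 256-byte mask on the stack: 8-byte lane k is all-ones iff k == idx */
#	uint64_t mask[32];
#	for (k = 0; k < 32; k++)
#		mask[k] = (k == idx) ? ~(uint64_t)0 : 0;
#
#	/* select word 0 of power idx without an idx-dependent load address */
#	uint64_t m0 = 0;
#	for (k = 0; k < 32; k++)
#		m0 |= bp[k] & mask[k];	/* every table entry is read */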
    158 $code.=<<___;
    159 	paddd	%xmm0,%xmm1
    160 	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
    161 	.byte	0x67
    162 	movdqa	%xmm4,%xmm3
    163 ___
    164 for($k=0;$k<$STRIDE/16-4;$k+=4) {
    165 $code.=<<___;
    166 	paddd	%xmm1,%xmm2
    167 	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
    168 	movdqa	%xmm0,`16*($k+0)+112`(%r10)
    169 	movdqa	%xmm4,%xmm0
    170 
    171 	paddd	%xmm2,%xmm3
    172 	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
    173 	movdqa	%xmm1,`16*($k+1)+112`(%r10)
    174 	movdqa	%xmm4,%xmm1
    175 
    176 	paddd	%xmm3,%xmm0
    177 	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
    178 	movdqa	%xmm2,`16*($k+2)+112`(%r10)
    179 	movdqa	%xmm4,%xmm2
    180 
    181 	paddd	%xmm0,%xmm1
    182 	pcmpeqd	%xmm5,%xmm0
    183 	movdqa	%xmm3,`16*($k+3)+112`(%r10)
    184 	movdqa	%xmm4,%xmm3
    185 ___
    186 }
    187 $code.=<<___;				# last iteration can be optimized
    188 	paddd	%xmm1,%xmm2
    189 	pcmpeqd	%xmm5,%xmm1
    190 	movdqa	%xmm0,`16*($k+0)+112`(%r10)
    191 
    192 	paddd	%xmm2,%xmm3
    193 	.byte	0x67
    194 	pcmpeqd	%xmm5,%xmm2
    195 	movdqa	%xmm1,`16*($k+1)+112`(%r10)
    196 
    197 	pcmpeqd	%xmm5,%xmm3
    198 	movdqa	%xmm2,`16*($k+2)+112`(%r10)
    199 	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register
    200 
    201 	pand	`16*($k+1)-128`($bp),%xmm1
    202 	pand	`16*($k+2)-128`($bp),%xmm2
    203 	movdqa	%xmm3,`16*($k+3)+112`(%r10)
    204 	pand	`16*($k+3)-128`($bp),%xmm3
    205 	por	%xmm2,%xmm0
    206 	por	%xmm3,%xmm1
    207 ___
    208 for($k=0;$k<$STRIDE/16-4;$k+=4) {
    209 $code.=<<___;
    210 	movdqa	`16*($k+0)-128`($bp),%xmm4
    211 	movdqa	`16*($k+1)-128`($bp),%xmm5
    212 	movdqa	`16*($k+2)-128`($bp),%xmm2
    213 	pand	`16*($k+0)+112`(%r10),%xmm4
    214 	movdqa	`16*($k+3)-128`($bp),%xmm3
    215 	pand	`16*($k+1)+112`(%r10),%xmm5
    216 	por	%xmm4,%xmm0
    217 	pand	`16*($k+2)+112`(%r10),%xmm2
    218 	por	%xmm5,%xmm1
    219 	pand	`16*($k+3)+112`(%r10),%xmm3
    220 	por	%xmm2,%xmm0
    221 	por	%xmm3,%xmm1
    222 ___
    223 }
    224 $code.=<<___;
    225 	por	%xmm1,%xmm0
    226 	pshufd	\$0x4e,%xmm0,%xmm1
    227 	por	%xmm1,%xmm0
    228 	lea	$STRIDE($bp),$bp
    229 	movq	%xmm0,$m0		# m0=bp[0]
    230 
    231 	mov	($n0),$n0		# pull n0[0] value
    232 	mov	($ap),%rax
    233 
    234 	xor	$i,$i			# i=0
    235 	xor	$j,$j			# j=0
    236 
    237 	mov	$n0,$m1
    238 	mulq	$m0			# ap[0]*bp[0]
    239 	mov	%rax,$lo0
    240 	mov	($np),%rax
    241 
    242 	imulq	$lo0,$m1		# "tp[0]"*n0
    243 	mov	%rdx,$hi0
    244 
    245 	mulq	$m1			# np[0]*m1
    246 	add	%rax,$lo0		# discarded
    247 	mov	8($ap),%rax
    248 	adc	\$0,%rdx
    249 	mov	%rdx,$hi1
    250 
    251 	lea	1($j),$j		# j++
    252 	jmp	.L1st_enter
    253 
    254 .align	16
    255 .L1st:
    256 	add	%rax,$hi1
    257 	mov	($ap,$j,8),%rax
    258 	adc	\$0,%rdx
    259 	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
    260 	mov	$lo0,$hi0
    261 	adc	\$0,%rdx
    262 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    263 	mov	%rdx,$hi1
    264 
    265 .L1st_enter:
    266 	mulq	$m0			# ap[j]*bp[0]
    267 	add	%rax,$hi0
    268 	mov	($np,$j,8),%rax
    269 	adc	\$0,%rdx
    270 	lea	1($j),$j		# j++
    271 	mov	%rdx,$lo0
    272 
    273 	mulq	$m1			# np[j]*m1
    274 	cmp	$num,$j
    275 	jne	.L1st			# note that upon exit $j==$num, so
    276 					# they can be used interchangeably
    277 
    278 	add	%rax,$hi1
    279 	adc	\$0,%rdx
    280 	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
    281 	adc	\$0,%rdx
    282 	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
    283 	mov	%rdx,$hi1
    284 	mov	$lo0,$hi0
    285 
    286 	xor	%rdx,%rdx
    287 	add	$hi0,$hi1
    288 	adc	\$0,%rdx
    289 	mov	$hi1,-8(%rsp,$num,8)
    290 	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
    291 
    292 	lea	1($i),$i		# i++
    293 	jmp	.Louter
    294 .align	16
    295 .Louter:
    296 	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
    297 	and	\$-16,%rdx
    298 	pxor	%xmm4,%xmm4
    299 	pxor	%xmm5,%xmm5
    300 ___
    301 for($k=0;$k<$STRIDE/16;$k+=4) {
    302 $code.=<<___;
    303 	movdqa	`16*($k+0)-128`($bp),%xmm0
    304 	movdqa	`16*($k+1)-128`($bp),%xmm1
    305 	movdqa	`16*($k+2)-128`($bp),%xmm2
    306 	movdqa	`16*($k+3)-128`($bp),%xmm3
    307 	pand	`16*($k+0)-128`(%rdx),%xmm0
    308 	pand	`16*($k+1)-128`(%rdx),%xmm1
    309 	por	%xmm0,%xmm4
    310 	pand	`16*($k+2)-128`(%rdx),%xmm2
    311 	por	%xmm1,%xmm5
    312 	pand	`16*($k+3)-128`(%rdx),%xmm3
    313 	por	%xmm2,%xmm4
    314 	por	%xmm3,%xmm5
    315 ___
    316 }
    317 $code.=<<___;
    318 	por	%xmm5,%xmm4
    319 	pshufd	\$0x4e,%xmm4,%xmm0
    320 	por	%xmm4,%xmm0
    321 	lea	$STRIDE($bp),$bp
    322 
    323 	mov	($ap),%rax		# ap[0]
    324 	movq	%xmm0,$m0		# m0=bp[i]
    325 
    326 	xor	$j,$j			# j=0
    327 	mov	$n0,$m1
    328 	mov	(%rsp),$lo0
    329 
    330 	mulq	$m0			# ap[0]*bp[i]
    331 	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
    332 	mov	($np),%rax
    333 	adc	\$0,%rdx
    334 
    335 	imulq	$lo0,$m1		# tp[0]*n0
    336 	mov	%rdx,$hi0
    337 
    338 	mulq	$m1			# np[0]*m1
    339 	add	%rax,$lo0		# discarded
    340 	mov	8($ap),%rax
    341 	adc	\$0,%rdx
    342 	mov	8(%rsp),$lo0		# tp[1]
    343 	mov	%rdx,$hi1
    344 
    345 	lea	1($j),$j		# j++
    346 	jmp	.Linner_enter
    347 
    348 .align	16
    349 .Linner:
    350 	add	%rax,$hi1
    351 	mov	($ap,$j,8),%rax
    352 	adc	\$0,%rdx
    353 	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
    354 	mov	(%rsp,$j,8),$lo0
    355 	adc	\$0,%rdx
    356 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    357 	mov	%rdx,$hi1
    358 
    359 .Linner_enter:
    360 	mulq	$m0			# ap[j]*bp[i]
    361 	add	%rax,$hi0
    362 	mov	($np,$j,8),%rax
    363 	adc	\$0,%rdx
    364 	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
    365 	mov	%rdx,$hi0
    366 	adc	\$0,$hi0
    367 	lea	1($j),$j		# j++
    368 
    369 	mulq	$m1			# np[j]*m1
    370 	cmp	$num,$j
    371 	jne	.Linner			# note that upon exit $j==$num, so
    372 					# they can be used interchangeably
    373 	add	%rax,$hi1
    374 	adc	\$0,%rdx
    375 	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
    376 	mov	(%rsp,$num,8),$lo0
    377 	adc	\$0,%rdx
    378 	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
    379 	mov	%rdx,$hi1
    380 
    381 	xor	%rdx,%rdx
    382 	add	$hi0,$hi1
    383 	adc	\$0,%rdx
    384 	add	$lo0,$hi1		# pull upmost overflow bit
    385 	adc	\$0,%rdx
    386 	mov	$hi1,-8(%rsp,$num,8)
    387 	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
    388 
    389 	lea	1($i),$i		# i++
    390 	cmp	$num,$i
    391 	jb	.Louter
    392 
    393 	xor	$i,$i			# i=0 and clear CF!
    394 	mov	(%rsp),%rax		# tp[0]
    395 	lea	(%rsp),$ap		# borrow ap for tp
    396 	mov	$num,$j			# j=num
    397 	jmp	.Lsub
    398 .align	16
    399 .Lsub:
    400 	sbb	($np,$i,8),%rax
    401 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
    402 	mov	8($ap,$i,8),%rax	# tp[i+1]
    403 	lea	1($i),$i		# i++
    404 	dec	$j			# doesn't affect CF!
    405 	jnz	.Lsub
    406 
    407 	sbb	\$0,%rax		# handle upmost overflow bit
    408 	xor	$i,$i
    409 	and	%rax,$ap
    410 	not	%rax
    411 	mov	$rp,$np
    412 	and	%rax,$np
    413 	mov	$num,$j			# j=num
    414 	or	$np,$ap			# ap=borrow?tp:rp
    415 .align	16
    416 .Lcopy:					# copy or in-place refresh
    417 	mov	($ap,$i,8),%rax
    418 	mov	$i,(%rsp,$i,8)		# zap temporary vector
    419 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
    420 	lea	1($i),$i
    421 	sub	\$1,$j
    422 	jnz	.Lcopy
    423 
    424 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
    425 .cfi_def_cfa	%rsi,8
    426 	mov	\$1,%rax
    427 
    428 	mov	-48(%rsi),%r15
    429 .cfi_restore	%r15
    430 	mov	-40(%rsi),%r14
    431 .cfi_restore	%r14
    432 	mov	-32(%rsi),%r13
    433 .cfi_restore	%r13
    434 	mov	-24(%rsi),%r12
    435 .cfi_restore	%r12
    436 	mov	-16(%rsi),%rbp
    437 .cfi_restore	%rbp
    438 	mov	-8(%rsi),%rbx
    439 .cfi_restore	%rbx
    440 	lea	(%rsi),%rsp
    441 .cfi_def_cfa_register	%rsp
    442 .Lmul_epilogue:
    443 	ret
    444 .cfi_endproc
    445 .size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
    446 ___
    447 {{{
    448 my @A=("%r10","%r11");
    449 my @N=("%r13","%rdi");
    450 $code.=<<___;
    451 .type	bn_mul4x_mont_gather5,\@function,6
    452 .align	32
    453 bn_mul4x_mont_gather5:
    454 .cfi_startproc
    455 	.byte	0x67
    456 	mov	%rsp,%rax
    457 .cfi_def_cfa_register	%rax
    458 .Lmul4x_enter:
    459 ___
    460 $code.=<<___ if ($addx);
    461 	and	\$0x80108,%r11d
    462 	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
    463 	je	.Lmulx4x_enter
    464 ___
    465 $code.=<<___;
    466 	push	%rbx
    467 .cfi_push	%rbx
    468 	push	%rbp
    469 .cfi_push	%rbp
    470 	push	%r12
    471 .cfi_push	%r12
    472 	push	%r13
    473 .cfi_push	%r13
    474 	push	%r14
    475 .cfi_push	%r14
    476 	push	%r15
    477 .cfi_push	%r15
    478 .Lmul4x_prologue:
    479 
    480 	.byte	0x67
    481 	shl	\$3,${num}d		# convert $num to bytes
    482 	lea	($num,$num,2),%r10	# 3*$num in bytes
    483 	neg	$num			# -$num
    484 
    485 	##############################################################
    486 	# Ensure that the stack frame doesn't alias with $rptr+3*$num
    487 	# modulo 4096, which covers ret[num], am[num] and n[num]
    488 	# (see bn_exp.c). This is done to allow the memory disambiguation
    489 	# logic to do its magic. [An extra [num] is allocated in order
    490 	# to align with bn_power5's frame, which is cleansed after
    491 	# completing the exponentiation. An extra 256 bytes is for the
    492 	# power mask calculated from the 7th argument, the index.]
    493 	#
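	#
	# Roughly (illustrative only):
	#
	#	dist = (candidate_sp - rp) & 4095;
	#	if (dist <= 3*num*8)
	#		candidate_sp -= dist;	/* same 4K offset as rp */
	#	else
	#		...			/* .Lmul4xsp_alt: shift down just
	#					 * enough to stay clear of the
	#					 * next 4K image of rp */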
    494 	lea	-320(%rsp,$num,2),%r11
    495 	mov	%rsp,%rbp
    496 	sub	$rp,%r11
    497 	and	\$4095,%r11
    498 	cmp	%r11,%r10
    499 	jb	.Lmul4xsp_alt
    500 	sub	%r11,%rbp		# align with $rp
    501 	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
    502 	jmp	.Lmul4xsp_done
    503 
    504 .align	32
    505 .Lmul4xsp_alt:
    506 	lea	4096-320(,$num,2),%r10
    507 	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
    508 	sub	%r10,%r11
    509 	mov	\$0,%r10
    510 	cmovc	%r10,%r11
    511 	sub	%r11,%rbp
    512 .Lmul4xsp_done:
    513 	and	\$-64,%rbp
    514 	mov	%rsp,%r11
    515 	sub	%rbp,%r11
    516 	and	\$-4096,%r11
    517 	lea	(%rbp,%r11),%rsp
    518 	mov	(%rsp),%r10
    519 	cmp	%rbp,%rsp
    520 	ja	.Lmul4x_page_walk
    521 	jmp	.Lmul4x_page_walk_done
    522 
    523 .Lmul4x_page_walk:
    524 	lea	-4096(%rsp),%rsp
    525 	mov	(%rsp),%r10
    526 	cmp	%rbp,%rsp
    527 	ja	.Lmul4x_page_walk
    528 .Lmul4x_page_walk_done:
    529 
    530 	neg	$num
    531 
    532 	mov	%rax,40(%rsp)
    533 .cfi_cfa_expression	%rsp+40,deref,+8
    534 .Lmul4x_body:
    535 
    536 	call	mul4x_internal
    537 
    538 	mov	40(%rsp),%rsi		# restore %rsp
    539 .cfi_def_cfa	%rsi,8
    540 	mov	\$1,%rax
    541 
    542 	mov	-48(%rsi),%r15
    543 .cfi_restore	%r15
    544 	mov	-40(%rsi),%r14
    545 .cfi_restore	%r14
    546 	mov	-32(%rsi),%r13
    547 .cfi_restore	%r13
    548 	mov	-24(%rsi),%r12
    549 .cfi_restore	%r12
    550 	mov	-16(%rsi),%rbp
    551 .cfi_restore	%rbp
    552 	mov	-8(%rsi),%rbx
    553 .cfi_restore	%rbx
    554 	lea	(%rsi),%rsp
    555 .cfi_def_cfa_register	%rsp
    556 .Lmul4x_epilogue:
    557 	ret
    558 .cfi_endproc
    559 .size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
    560 
    561 .type	mul4x_internal,\@abi-omnipotent
    562 .align	32
    563 mul4x_internal:
    564 	shl	\$5,$num		# $num was in bytes
    565 	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument, index
    566 	lea	.Linc(%rip),%rax
    567 	lea	128(%rdx,$num),%r13	# end of powers table (+size optimization)
    568 	shr	\$5,$num		# restore $num
    569 ___
    570 		$bp="%r12";
    571 		$STRIDE=2**5*8;		# 5 is "window size"
    572 		$N=$STRIDE/4;		# should match cache line size
    573 		$tp=$i;
    574 $code.=<<___;
    575 	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
    576 	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
    577 	lea	88-112(%rsp,$num),%r10	# place the mask after tp[num+1] (+ICache optimization)
    578 	lea	128(%rdx),$bp		# size optimization
    579 
    580 	pshufd	\$0,%xmm5,%xmm5		# broadcast index
    581 	movdqa	%xmm1,%xmm4
    582 	.byte	0x67,0x67
    583 	movdqa	%xmm1,%xmm2
    584 ___
    585 ########################################################################
    586 # calculate mask by comparing 0..31 to index and save result to stack
    587 #
    588 $code.=<<___;
    589 	paddd	%xmm0,%xmm1
    590 	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
    591 	.byte	0x67
    592 	movdqa	%xmm4,%xmm3
    593 ___
    594 for($i=0;$i<$STRIDE/16-4;$i+=4) {
    595 $code.=<<___;
    596 	paddd	%xmm1,%xmm2
    597 	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
    598 	movdqa	%xmm0,`16*($i+0)+112`(%r10)
    599 	movdqa	%xmm4,%xmm0
    600 
    601 	paddd	%xmm2,%xmm3
    602 	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
    603 	movdqa	%xmm1,`16*($i+1)+112`(%r10)
    604 	movdqa	%xmm4,%xmm1
    605 
    606 	paddd	%xmm3,%xmm0
    607 	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
    608 	movdqa	%xmm2,`16*($i+2)+112`(%r10)
    609 	movdqa	%xmm4,%xmm2
    610 
    611 	paddd	%xmm0,%xmm1
    612 	pcmpeqd	%xmm5,%xmm0
    613 	movdqa	%xmm3,`16*($i+3)+112`(%r10)
    614 	movdqa	%xmm4,%xmm3
    615 ___
    616 }
    617 $code.=<<___;				# last iteration can be optimized
    618 	paddd	%xmm1,%xmm2
    619 	pcmpeqd	%xmm5,%xmm1
    620 	movdqa	%xmm0,`16*($i+0)+112`(%r10)
    621 
    622 	paddd	%xmm2,%xmm3
    623 	.byte	0x67
    624 	pcmpeqd	%xmm5,%xmm2
    625 	movdqa	%xmm1,`16*($i+1)+112`(%r10)
    626 
    627 	pcmpeqd	%xmm5,%xmm3
    628 	movdqa	%xmm2,`16*($i+2)+112`(%r10)
    629 	pand	`16*($i+0)-128`($bp),%xmm0	# while it's still in register
    630 
    631 	pand	`16*($i+1)-128`($bp),%xmm1
    632 	pand	`16*($i+2)-128`($bp),%xmm2
    633 	movdqa	%xmm3,`16*($i+3)+112`(%r10)
    634 	pand	`16*($i+3)-128`($bp),%xmm3
    635 	por	%xmm2,%xmm0
    636 	por	%xmm3,%xmm1
    637 ___
    638 for($i=0;$i<$STRIDE/16-4;$i+=4) {
    639 $code.=<<___;
    640 	movdqa	`16*($i+0)-128`($bp),%xmm4
    641 	movdqa	`16*($i+1)-128`($bp),%xmm5
    642 	movdqa	`16*($i+2)-128`($bp),%xmm2
    643 	pand	`16*($i+0)+112`(%r10),%xmm4
    644 	movdqa	`16*($i+3)-128`($bp),%xmm3
    645 	pand	`16*($i+1)+112`(%r10),%xmm5
    646 	por	%xmm4,%xmm0
    647 	pand	`16*($i+2)+112`(%r10),%xmm2
    648 	por	%xmm5,%xmm1
    649 	pand	`16*($i+3)+112`(%r10),%xmm3
    650 	por	%xmm2,%xmm0
    651 	por	%xmm3,%xmm1
    652 ___
    653 }
    654 $code.=<<___;
    655 	por	%xmm1,%xmm0
    656 	pshufd	\$0x4e,%xmm0,%xmm1
    657 	por	%xmm1,%xmm0
    658 	lea	$STRIDE($bp),$bp
    659 	movq	%xmm0,$m0		# m0=bp[0]
    660 
    661 	mov	%r13,16+8(%rsp)		# save end of b[num]
    662 	mov	$rp, 56+8(%rsp)		# save $rp
    663 
    664 	mov	($n0),$n0		# pull n0[0] value
    665 	mov	($ap),%rax
    666 	lea	($ap,$num),$ap		# end of a[num]
    667 	neg	$num
    668 
    669 	mov	$n0,$m1
    670 	mulq	$m0			# ap[0]*bp[0]
    671 	mov	%rax,$A[0]
    672 	mov	($np),%rax
    673 
    674 	imulq	$A[0],$m1		# "tp[0]"*n0
    675 	lea	64+8(%rsp),$tp
    676 	mov	%rdx,$A[1]
    677 
    678 	mulq	$m1			# np[0]*m1
    679 	add	%rax,$A[0]		# discarded
    680 	mov	8($ap,$num),%rax
    681 	adc	\$0,%rdx
    682 	mov	%rdx,$N[1]
    683 
    684 	mulq	$m0
    685 	add	%rax,$A[1]
    686 	mov	8*1($np),%rax
    687 	adc	\$0,%rdx
    688 	mov	%rdx,$A[0]
    689 
    690 	mulq	$m1
    691 	add	%rax,$N[1]
    692 	mov	16($ap,$num),%rax
    693 	adc	\$0,%rdx
    694 	add	$A[1],$N[1]
    695 	lea	4*8($num),$j		# j=4
    696 	lea	8*4($np),$np
    697 	adc	\$0,%rdx
    698 	mov	$N[1],($tp)
    699 	mov	%rdx,$N[0]
    700 	jmp	.L1st4x
    701 
    702 .align	32
    703 .L1st4x:
    704 	mulq	$m0			# ap[j]*bp[0]
    705 	add	%rax,$A[0]
    706 	mov	-8*2($np),%rax
    707 	lea	32($tp),$tp
    708 	adc	\$0,%rdx
    709 	mov	%rdx,$A[1]
    710 
    711 	mulq	$m1			# np[j]*m1
    712 	add	%rax,$N[0]
    713 	mov	-8($ap,$j),%rax
    714 	adc	\$0,%rdx
    715 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    716 	adc	\$0,%rdx
    717 	mov	$N[0],-24($tp)		# tp[j-1]
    718 	mov	%rdx,$N[1]
    719 
    720 	mulq	$m0			# ap[j]*bp[0]
    721 	add	%rax,$A[1]
    722 	mov	-8*1($np),%rax
    723 	adc	\$0,%rdx
    724 	mov	%rdx,$A[0]
    725 
    726 	mulq	$m1			# np[j]*m1
    727 	add	%rax,$N[1]
    728 	mov	($ap,$j),%rax
    729 	adc	\$0,%rdx
    730 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    731 	adc	\$0,%rdx
    732 	mov	$N[1],-16($tp)		# tp[j-1]
    733 	mov	%rdx,$N[0]
    734 
    735 	mulq	$m0			# ap[j]*bp[0]
    736 	add	%rax,$A[0]
    737 	mov	8*0($np),%rax
    738 	adc	\$0,%rdx
    739 	mov	%rdx,$A[1]
    740 
    741 	mulq	$m1			# np[j]*m1
    742 	add	%rax,$N[0]
    743 	mov	8($ap,$j),%rax
    744 	adc	\$0,%rdx
    745 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    746 	adc	\$0,%rdx
    747 	mov	$N[0],-8($tp)		# tp[j-1]
    748 	mov	%rdx,$N[1]
    749 
    750 	mulq	$m0			# ap[j]*bp[0]
    751 	add	%rax,$A[1]
    752 	mov	8*1($np),%rax
    753 	adc	\$0,%rdx
    754 	mov	%rdx,$A[0]
    755 
    756 	mulq	$m1			# np[j]*m1
    757 	add	%rax,$N[1]
    758 	mov	16($ap,$j),%rax
    759 	adc	\$0,%rdx
    760 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    761 	lea	8*4($np),$np
    762 	adc	\$0,%rdx
    763 	mov	$N[1],($tp)		# tp[j-1]
    764 	mov	%rdx,$N[0]
    765 
    766 	add	\$32,$j			# j+=4
    767 	jnz	.L1st4x
    768 
    769 	mulq	$m0			# ap[j]*bp[0]
    770 	add	%rax,$A[0]
    771 	mov	-8*2($np),%rax
    772 	lea	32($tp),$tp
    773 	adc	\$0,%rdx
    774 	mov	%rdx,$A[1]
    775 
    776 	mulq	$m1			# np[j]*m1
    777 	add	%rax,$N[0]
    778 	mov	-8($ap),%rax
    779 	adc	\$0,%rdx
    780 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    781 	adc	\$0,%rdx
    782 	mov	$N[0],-24($tp)		# tp[j-1]
    783 	mov	%rdx,$N[1]
    784 
    785 	mulq	$m0			# ap[j]*bp[0]
    786 	add	%rax,$A[1]
    787 	mov	-8*1($np),%rax
    788 	adc	\$0,%rdx
    789 	mov	%rdx,$A[0]
    790 
    791 	mulq	$m1			# np[j]*m1
    792 	add	%rax,$N[1]
    793 	mov	($ap,$num),%rax		# ap[0]
    794 	adc	\$0,%rdx
    795 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    796 	adc	\$0,%rdx
    797 	mov	$N[1],-16($tp)		# tp[j-1]
    798 	mov	%rdx,$N[0]
    799 
    800 	lea	($np,$num),$np		# rewind $np
    801 
    802 	xor	$N[1],$N[1]
    803 	add	$A[0],$N[0]
    804 	adc	\$0,$N[1]
    805 	mov	$N[0],-8($tp)
    806 
    807 	jmp	.Louter4x
    808 
    809 .align	32
    810 .Louter4x:
    811 	lea	16+128($tp),%rdx	# where 256-byte mask is (+size optimization)
    812 	pxor	%xmm4,%xmm4
    813 	pxor	%xmm5,%xmm5
    814 ___
    815 for($i=0;$i<$STRIDE/16;$i+=4) {
    816 $code.=<<___;
    817 	movdqa	`16*($i+0)-128`($bp),%xmm0
    818 	movdqa	`16*($i+1)-128`($bp),%xmm1
    819 	movdqa	`16*($i+2)-128`($bp),%xmm2
    820 	movdqa	`16*($i+3)-128`($bp),%xmm3
    821 	pand	`16*($i+0)-128`(%rdx),%xmm0
    822 	pand	`16*($i+1)-128`(%rdx),%xmm1
    823 	por	%xmm0,%xmm4
    824 	pand	`16*($i+2)-128`(%rdx),%xmm2
    825 	por	%xmm1,%xmm5
    826 	pand	`16*($i+3)-128`(%rdx),%xmm3
    827 	por	%xmm2,%xmm4
    828 	por	%xmm3,%xmm5
    829 ___
    830 }
    831 $code.=<<___;
    832 	por	%xmm5,%xmm4
    833 	pshufd	\$0x4e,%xmm4,%xmm0
    834 	por	%xmm4,%xmm0
    835 	lea	$STRIDE($bp),$bp
    836 	movq	%xmm0,$m0		# m0=bp[i]
    837 
    838 	mov	($tp,$num),$A[0]
    839 	mov	$n0,$m1
    840 	mulq	$m0			# ap[0]*bp[i]
    841 	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
    842 	mov	($np),%rax
    843 	adc	\$0,%rdx
    844 
    845 	imulq	$A[0],$m1		# tp[0]*n0
    846 	mov	%rdx,$A[1]
    847 	mov	$N[1],($tp)		# store upmost overflow bit
    848 
    849 	lea	($tp,$num),$tp		# rewind $tp
    850 
    851 	mulq	$m1			# np[0]*m1
    852 	add	%rax,$A[0]		# "$N[0]", discarded
    853 	mov	8($ap,$num),%rax
    854 	adc	\$0,%rdx
    855 	mov	%rdx,$N[1]
    856 
    857 	mulq	$m0			# ap[j]*bp[i]
    858 	add	%rax,$A[1]
    859 	mov	8*1($np),%rax
    860 	adc	\$0,%rdx
    861 	add	8($tp),$A[1]		# +tp[1]
    862 	adc	\$0,%rdx
    863 	mov	%rdx,$A[0]
    864 
    865 	mulq	$m1			# np[j]*m1
    866 	add	%rax,$N[1]
    867 	mov	16($ap,$num),%rax
    868 	adc	\$0,%rdx
    869 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
    870 	lea	4*8($num),$j		# j=4
    871 	lea	8*4($np),$np
    872 	adc	\$0,%rdx
    873 	mov	%rdx,$N[0]
    874 	jmp	.Linner4x
    875 
    876 .align	32
    877 .Linner4x:
    878 	mulq	$m0			# ap[j]*bp[i]
    879 	add	%rax,$A[0]
    880 	mov	-8*2($np),%rax
    881 	adc	\$0,%rdx
    882 	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
    883 	lea	32($tp),$tp
    884 	adc	\$0,%rdx
    885 	mov	%rdx,$A[1]
    886 
    887 	mulq	$m1			# np[j]*m1
    888 	add	%rax,$N[0]
    889 	mov	-8($ap,$j),%rax
    890 	adc	\$0,%rdx
    891 	add	$A[0],$N[0]
    892 	adc	\$0,%rdx
    893 	mov	$N[1],-32($tp)		# tp[j-1]
    894 	mov	%rdx,$N[1]
    895 
    896 	mulq	$m0			# ap[j]*bp[i]
    897 	add	%rax,$A[1]
    898 	mov	-8*1($np),%rax
    899 	adc	\$0,%rdx
    900 	add	-8($tp),$A[1]
    901 	adc	\$0,%rdx
    902 	mov	%rdx,$A[0]
    903 
    904 	mulq	$m1			# np[j]*m1
    905 	add	%rax,$N[1]
    906 	mov	($ap,$j),%rax
    907 	adc	\$0,%rdx
    908 	add	$A[1],$N[1]
    909 	adc	\$0,%rdx
    910 	mov	$N[0],-24($tp)		# tp[j-1]
    911 	mov	%rdx,$N[0]
    912 
    913 	mulq	$m0			# ap[j]*bp[i]
    914 	add	%rax,$A[0]
    915 	mov	8*0($np),%rax
    916 	adc	\$0,%rdx
    917 	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
    918 	adc	\$0,%rdx
    919 	mov	%rdx,$A[1]
    920 
    921 	mulq	$m1			# np[j]*m1
    922 	add	%rax,$N[0]
    923 	mov	8($ap,$j),%rax
    924 	adc	\$0,%rdx
    925 	add	$A[0],$N[0]
    926 	adc	\$0,%rdx
    927 	mov	$N[1],-16($tp)		# tp[j-1]
    928 	mov	%rdx,$N[1]
    929 
    930 	mulq	$m0			# ap[j]*bp[i]
    931 	add	%rax,$A[1]
    932 	mov	8*1($np),%rax
    933 	adc	\$0,%rdx
    934 	add	8($tp),$A[1]
    935 	adc	\$0,%rdx
    936 	mov	%rdx,$A[0]
    937 
    938 	mulq	$m1			# np[j]*m1
    939 	add	%rax,$N[1]
    940 	mov	16($ap,$j),%rax
    941 	adc	\$0,%rdx
    942 	add	$A[1],$N[1]
    943 	lea	8*4($np),$np
    944 	adc	\$0,%rdx
    945 	mov	$N[0],-8($tp)		# tp[j-1]
    946 	mov	%rdx,$N[0]
    947 
    948 	add	\$32,$j			# j+=4
    949 	jnz	.Linner4x
    950 
    951 	mulq	$m0			# ap[j]*bp[i]
    952 	add	%rax,$A[0]
    953 	mov	-8*2($np),%rax
    954 	adc	\$0,%rdx
    955 	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
    956 	lea	32($tp),$tp
    957 	adc	\$0,%rdx
    958 	mov	%rdx,$A[1]
    959 
    960 	mulq	$m1			# np[j]*m1
    961 	add	%rax,$N[0]
    962 	mov	-8($ap),%rax
    963 	adc	\$0,%rdx
    964 	add	$A[0],$N[0]
    965 	adc	\$0,%rdx
    966 	mov	$N[1],-32($tp)		# tp[j-1]
    967 	mov	%rdx,$N[1]
    968 
    969 	mulq	$m0			# ap[j]*bp[i]
    970 	add	%rax,$A[1]
    971 	mov	$m1,%rax
    972 	mov	-8*1($np),$m1
    973 	adc	\$0,%rdx
    974 	add	-8($tp),$A[1]
    975 	adc	\$0,%rdx
    976 	mov	%rdx,$A[0]
    977 
    978 	mulq	$m1			# np[j]*m1
    979 	add	%rax,$N[1]
    980 	mov	($ap,$num),%rax		# ap[0]
    981 	adc	\$0,%rdx
    982 	add	$A[1],$N[1]
    983 	adc	\$0,%rdx
    984 	mov	$N[0],-24($tp)		# tp[j-1]
    985 	mov	%rdx,$N[0]
    986 
    987 	mov	$N[1],-16($tp)		# tp[j-1]
    988 	lea	($np,$num),$np		# rewind $np
    989 
    990 	xor	$N[1],$N[1]
    991 	add	$A[0],$N[0]
    992 	adc	\$0,$N[1]
    993 	add	($tp),$N[0]		# pull upmost overflow bit
    994 	adc	\$0,$N[1]		# upmost overflow bit
    995 	mov	$N[0],-8($tp)
    996 
    997 	cmp	16+8(%rsp),$bp
    998 	jb	.Louter4x
    999 ___
   1000 if (1) {
   1001 $code.=<<___;
   1002 	xor	%rax,%rax
   1003 	sub	$N[0],$m1		# compare top-most words
   1004 	adc	$j,$j			# $j is zero
   1005 	or	$j,$N[1]
   1006 	sub	$N[1],%rax		# %rax=-$N[1]
   1007 	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
   1008 	mov	($np),%r12
   1009 	lea	($np),%rbp		# nptr in .sqr4x_sub
   1010 	mov	%r9,%rcx
   1011 	sar	\$3+2,%rcx
   1012 	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
   1013 	dec	%r12			# so that after 'not' we get -n[0]
   1014 	xor	%r10,%r10
   1015 	mov	8*1(%rbp),%r13
   1016 	mov	8*2(%rbp),%r14
   1017 	mov	8*3(%rbp),%r15
   1018 	jmp	.Lsqr4x_sub_entry
   1019 ___
   1020 } else {
   1021 my @ri=("%rax",$bp,$m0,$m1);
   1022 my $rp="%rdx";
   1023 $code.=<<___
   1024 	xor	\$1,$N[1]
   1025 	lea	($tp,$num),$tp		# rewind $tp
   1026 	sar	\$5,$num		# cf=0
   1027 	lea	($np,$N[1],8),$np
   1028 	mov	56+8(%rsp),$rp		# restore $rp
   1029 	jmp	.Lsub4x
   1030 
   1031 .align	32
   1032 .Lsub4x:
   1033 	.byte	0x66
   1034 	mov	8*0($tp),@ri[0]
   1035 	mov	8*1($tp),@ri[1]
   1036 	.byte	0x66
   1037 	sbb	16*0($np),@ri[0]
   1038 	mov	8*2($tp),@ri[2]
   1039 	sbb	16*1($np),@ri[1]
   1040 	mov	3*8($tp),@ri[3]
   1041 	lea	4*8($tp),$tp
   1042 	sbb	16*2($np),@ri[2]
   1043 	mov	@ri[0],8*0($rp)
   1044 	sbb	16*3($np),@ri[3]
   1045 	lea	16*4($np),$np
   1046 	mov	@ri[1],8*1($rp)
   1047 	mov	@ri[2],8*2($rp)
   1048 	mov	@ri[3],8*3($rp)
   1049 	lea	8*4($rp),$rp
   1050 
   1051 	inc	$num
   1052 	jnz	.Lsub4x
   1053 
   1054 	ret
   1055 ___
   1056 }
   1057 $code.=<<___;
   1058 .size	mul4x_internal,.-mul4x_internal
   1059 ___
   1060 }}}
   1061 {{{
   1063 ######################################################################
   1064 # void bn_power5(
   1065 my $rptr="%rdi";	# BN_ULONG *rptr,
   1066 my $aptr="%rsi";	# const BN_ULONG *aptr,
   1067 my $bptr="%rdx";	# const void *table,
   1068 my $nptr="%rcx";	# const BN_ULONG *nptr,
   1069 my $n0  ="%r8";		# const BN_ULONG *n0);
   1070 my $num ="%r9";		# int num, has to be divisible by 8
   1071 			# int pwr
   1072 
   1073 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
   1074 my @A0=("%r10","%r11");
   1075 my @A1=("%r12","%r13");
   1076 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
   1077 
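# bn_power5 performs five Montgomery squarings of the input followed by one
# Montgomery multiplication with the power gathered from the table. A C-style
# sketch of the net effect (illustrative only; every value is in Montgomery
# representation, and mont_sqr/mont_mul/gather are hypothetical helpers):
#
#	/* rptr[] = aptr[]^(2^5) * table[pwr] mod nptr[] */
#	r = a;
#	for (i = 0; i < 5; i++)
#		r = mont_sqr(r, n, n0);		/* __bn_sqr8x_internal + post4x */
#	rptr = mont_mul(r, gather(table, pwr), n, n0);	/* mul4x_internal */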
   1078 $code.=<<___;
   1079 .globl	bn_power5
   1080 .type	bn_power5,\@function,6
   1081 .align	32
   1082 bn_power5:
   1083 .cfi_startproc
   1084 	mov	%rsp,%rax
   1085 .cfi_def_cfa_register	%rax
   1086 ___
   1087 $code.=<<___ if ($addx);
   1088 	leaq	OPENSSL_ia32cap_P(%rip),%r11
   1089 	mov	8(%r11),%r11d
   1090 	and	\$0x80108,%r11d
   1091 	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
   1092 	je	.Lpowerx5_enter
   1093 ___
   1094 $code.=<<___;
   1095 	push	%rbx
   1096 .cfi_push	%rbx
   1097 	push	%rbp
   1098 .cfi_push	%rbp
   1099 	push	%r12
   1100 .cfi_push	%r12
   1101 	push	%r13
   1102 .cfi_push	%r13
   1103 	push	%r14
   1104 .cfi_push	%r14
   1105 	push	%r15
   1106 .cfi_push	%r15
   1107 .Lpower5_prologue:
   1108 
   1109 	shl	\$3,${num}d		# convert $num to bytes
   1110 	lea	($num,$num,2),%r10d	# 3*$num
   1111 	neg	$num
   1112 	mov	($n0),$n0		# *n0
   1113 
   1114 	##############################################################
   1115 	# Ensure that the stack frame doesn't alias with $rptr+3*$num
   1116 	# modulo 4096, which covers ret[num], am[num] and n[num]
   1117 	# (see bn_exp.c). This is done to allow the memory disambiguation
   1118 	# logic to do its magic. [An extra 256 bytes is for the power mask
   1119 	# calculated from the 7th argument, the index.]
   1120 	#
   1121 	lea	-320(%rsp,$num,2),%r11
   1122 	mov	%rsp,%rbp
   1123 	sub	$rptr,%r11
   1124 	and	\$4095,%r11
   1125 	cmp	%r11,%r10
   1126 	jb	.Lpwr_sp_alt
   1127 	sub	%r11,%rbp		# align with $aptr
   1128 	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
   1129 	jmp	.Lpwr_sp_done
   1130 
   1131 .align	32
   1132 .Lpwr_sp_alt:
   1133 	lea	4096-320(,$num,2),%r10
   1134 	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
   1135 	sub	%r10,%r11
   1136 	mov	\$0,%r10
   1137 	cmovc	%r10,%r11
   1138 	sub	%r11,%rbp
   1139 .Lpwr_sp_done:
   1140 	and	\$-64,%rbp
   1141 	mov	%rsp,%r11
   1142 	sub	%rbp,%r11
   1143 	and	\$-4096,%r11
   1144 	lea	(%rbp,%r11),%rsp
   1145 	mov	(%rsp),%r10
   1146 	cmp	%rbp,%rsp
   1147 	ja	.Lpwr_page_walk
   1148 	jmp	.Lpwr_page_walk_done
   1149 
   1150 .Lpwr_page_walk:
   1151 	lea	-4096(%rsp),%rsp
   1152 	mov	(%rsp),%r10
   1153 	cmp	%rbp,%rsp
   1154 	ja	.Lpwr_page_walk
   1155 .Lpwr_page_walk_done:
   1156 
   1157 	mov	$num,%r10
   1158 	neg	$num
   1159 
   1160 	##############################################################
   1161 	# Stack layout
   1162 	#
   1163 	# +0	saved $num, used in reduction section
   1164 	# +8	&t[2*$num], used in reduction section
   1165 	# +32	saved *n0
   1166 	# +40	saved %rsp
   1167 	# +48	t[2*$num]
   1168 	#
   1169 	mov	$n0,  32(%rsp)
   1170 	mov	%rax, 40(%rsp)		# save original %rsp
   1171 .cfi_cfa_expression	%rsp+40,deref,+8
   1172 .Lpower5_body:
   1173 	movq	$rptr,%xmm1		# save $rptr, used in sqr8x
   1174 	movq	$nptr,%xmm2		# save $nptr
   1175 	movq	%r10, %xmm3		# -$num, used in sqr8x
   1176 	movq	$bptr,%xmm4
   1177 
   1178 	call	__bn_sqr8x_internal
   1179 	call	__bn_post4x_internal
   1180 	call	__bn_sqr8x_internal
   1181 	call	__bn_post4x_internal
   1182 	call	__bn_sqr8x_internal
   1183 	call	__bn_post4x_internal
   1184 	call	__bn_sqr8x_internal
   1185 	call	__bn_post4x_internal
   1186 	call	__bn_sqr8x_internal
   1187 	call	__bn_post4x_internal
   1188 
   1189 	movq	%xmm2,$nptr
   1190 	movq	%xmm4,$bptr
   1191 	mov	$aptr,$rptr
   1192 	mov	40(%rsp),%rax
   1193 	lea	32(%rsp),$n0
   1194 
   1195 	call	mul4x_internal
   1196 
   1197 	mov	40(%rsp),%rsi		# restore %rsp
   1198 .cfi_def_cfa	%rsi,8
   1199 	mov	\$1,%rax
   1200 	mov	-48(%rsi),%r15
   1201 .cfi_restore	%r15
   1202 	mov	-40(%rsi),%r14
   1203 .cfi_restore	%r14
   1204 	mov	-32(%rsi),%r13
   1205 .cfi_restore	%r13
   1206 	mov	-24(%rsi),%r12
   1207 .cfi_restore	%r12
   1208 	mov	-16(%rsi),%rbp
   1209 .cfi_restore	%rbp
   1210 	mov	-8(%rsi),%rbx
   1211 .cfi_restore	%rbx
   1212 	lea	(%rsi),%rsp
   1213 .cfi_def_cfa_register	%rsp
   1214 .Lpower5_epilogue:
   1215 	ret
   1216 .cfi_endproc
   1217 .size	bn_power5,.-bn_power5
   1218 
   1219 .globl	bn_sqr8x_internal
   1220 .hidden	bn_sqr8x_internal
   1221 .type	bn_sqr8x_internal,\@abi-omnipotent
   1222 .align	32
   1223 bn_sqr8x_internal:
   1224 __bn_sqr8x_internal:
   1225 	##############################################################
   1226 	# Squaring part:
   1227 	#
   1228 	# a) multiply-n-add everything but a[i]*a[i];
   1229 	# b) shift the result of a) left by 1 bit and accumulate the
   1230 	#    a[i]*a[i] products;
   1231 	#
   1232 	##############################################################
   1233 	#                                                     a[1]a[0]
   1234 	#                                                 a[2]a[0]
   1235 	#                                             a[3]a[0]
   1236 	#                                             a[2]a[1]
   1237 	#                                         a[4]a[0]
   1238 	#                                         a[3]a[1]
   1239 	#                                     a[5]a[0]
   1240 	#                                     a[4]a[1]
   1241 	#                                     a[3]a[2]
   1242 	#                                 a[6]a[0]
   1243 	#                                 a[5]a[1]
   1244 	#                                 a[4]a[2]
   1245 	#                             a[7]a[0]
   1246 	#                             a[6]a[1]
   1247 	#                             a[5]a[2]
   1248 	#                             a[4]a[3]
   1249 	#                         a[7]a[1]
   1250 	#                         a[6]a[2]
   1251 	#                         a[5]a[3]
   1252 	#                     a[7]a[2]
   1253 	#                     a[6]a[3]
   1254 	#                     a[5]a[4]
   1255 	#                 a[7]a[3]
   1256 	#                 a[6]a[4]
   1257 	#             a[7]a[4]
   1258 	#             a[6]a[5]
   1259 	#         a[7]a[5]
   1260 	#     a[7]a[6]
   1261 	#                                                     a[1]a[0]
   1262 	#                                                 a[2]a[0]
   1263 	#                                             a[3]a[0]
   1264 	#                                         a[4]a[0]
   1265 	#                                     a[5]a[0]
   1266 	#                                 a[6]a[0]
   1267 	#                             a[7]a[0]
   1268 	#                                             a[2]a[1]
   1269 	#                                         a[3]a[1]
   1270 	#                                     a[4]a[1]
   1271 	#                                 a[5]a[1]
   1272 	#                             a[6]a[1]
   1273 	#                         a[7]a[1]
   1274 	#                                     a[3]a[2]
   1275 	#                                 a[4]a[2]
   1276 	#                             a[5]a[2]
   1277 	#                         a[6]a[2]
   1278 	#                     a[7]a[2]
   1279 	#                             a[4]a[3]
   1280 	#                         a[5]a[3]
   1281 	#                     a[6]a[3]
   1282 	#                 a[7]a[3]
   1283 	#                     a[5]a[4]
   1284 	#                 a[6]a[4]
   1285 	#             a[7]a[4]
   1286 	#             a[6]a[5]
   1287 	#         a[7]a[5]
   1288 	#     a[7]a[6]
   1289 	#                                                         a[0]a[0]
   1290 	#                                                 a[1]a[1]
   1291 	#                                         a[2]a[2]
   1292 	#                                 a[3]a[3]
   1293 	#                         a[4]a[4]
   1294 	#                 a[5]a[5]
   1295 	#         a[6]a[6]
   1296 	# a[7]a[7]
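	#
	# A C-style sketch of the two steps above (illustrative only;
	# shift_left_1 and add_at are hypothetical helpers):
	#
	#	/* a) off-diagonal products, each pair computed once */
	#	for (i = 0; i < num; i++)
	#		for (j = i + 1; j < num; j++)
	#			t[i+j] += a[i]*a[j];	/* with carry propagation */
	#
	#	/* b) double the whole 2*num-word result, then add the squares */
	#	shift_left_1(t, 2*num);
	#	for (i = 0; i < num; i++)
	#		add_at(t, 2*i, (uint128)a[i]*a[i]);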
   1297 
   1298 	lea	32(%r10),$i		# $i=-($num-32)
   1299 	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
   1300 
   1301 	mov	$num,$j			# $j=$num
   1302 
   1303 					# comments apply to $num==8 case
   1304 	mov	-32($aptr,$i),$a0	# a[0]
   1305 	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
   1306 	mov	-24($aptr,$i),%rax	# a[1]
   1307 	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
   1308 	mov	-16($aptr,$i),$ai	# a[2]
   1309 	mov	%rax,$a1
   1310 
   1311 	mul	$a0			# a[1]*a[0]
   1312 	mov	%rax,$A0[0]		# a[1]*a[0]
   1313 	 mov	$ai,%rax		# a[2]
   1314 	mov	%rdx,$A0[1]
   1315 	mov	$A0[0],-24($tptr,$i)	# t[1]
   1316 
   1317 	mul	$a0			# a[2]*a[0]
   1318 	add	%rax,$A0[1]
   1319 	 mov	$ai,%rax
   1320 	adc	\$0,%rdx
   1321 	mov	$A0[1],-16($tptr,$i)	# t[2]
   1322 	mov	%rdx,$A0[0]
   1323 
   1324 
   1325 	 mov	-8($aptr,$i),$ai	# a[3]
   1326 	mul	$a1			# a[2]*a[1]
   1327 	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
   1328 	 mov	$ai,%rax
   1329 	mov	%rdx,$A1[1]
   1330 
   1331 	 lea	($i),$j
   1332 	mul	$a0			# a[3]*a[0]
   1333 	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
   1334 	 mov	$ai,%rax
   1335 	mov	%rdx,$A0[1]
   1336 	adc	\$0,$A0[1]
   1337 	add	$A1[0],$A0[0]
   1338 	adc	\$0,$A0[1]
   1339 	mov	$A0[0],-8($tptr,$j)	# t[3]
   1340 	jmp	.Lsqr4x_1st
   1341 
   1342 .align	32
   1343 .Lsqr4x_1st:
   1344 	 mov	($aptr,$j),$ai		# a[4]
   1345 	mul	$a1			# a[3]*a[1]
   1346 	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
   1347 	 mov	$ai,%rax
   1348 	mov	%rdx,$A1[0]
   1349 	adc	\$0,$A1[0]
   1350 
   1351 	mul	$a0			# a[4]*a[0]
   1352 	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
   1353 	 mov	$ai,%rax		# a[3]
   1354 	 mov	8($aptr,$j),$ai		# a[5]
   1355 	mov	%rdx,$A0[0]
   1356 	adc	\$0,$A0[0]
   1357 	add	$A1[1],$A0[1]
   1358 	adc	\$0,$A0[0]
   1359 
   1360 
   1361 	mul	$a1			# a[4]*a[3]
   1362 	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
   1363 	 mov	$ai,%rax
   1364 	 mov	$A0[1],($tptr,$j)	# t[4]
   1365 	mov	%rdx,$A1[1]
   1366 	adc	\$0,$A1[1]
   1367 
   1368 	mul	$a0			# a[5]*a[2]
   1369 	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
   1370 	 mov	$ai,%rax
   1371 	 mov	16($aptr,$j),$ai	# a[6]
   1372 	mov	%rdx,$A0[1]
   1373 	adc	\$0,$A0[1]
   1374 	add	$A1[0],$A0[0]
   1375 	adc	\$0,$A0[1]
   1376 
   1377 	mul	$a1			# a[5]*a[3]
   1378 	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
   1379 	 mov	$ai,%rax
   1380 	 mov	$A0[0],8($tptr,$j)	# t[5]
   1381 	mov	%rdx,$A1[0]
   1382 	adc	\$0,$A1[0]
   1383 
   1384 	mul	$a0			# a[6]*a[2]
   1385 	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
   1386 	 mov	$ai,%rax		# a[3]
   1387 	 mov	24($aptr,$j),$ai	# a[7]
   1388 	mov	%rdx,$A0[0]
   1389 	adc	\$0,$A0[0]
   1390 	add	$A1[1],$A0[1]
   1391 	adc	\$0,$A0[0]
   1392 
   1393 
   1394 	mul	$a1			# a[6]*a[5]
   1395 	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
   1396 	 mov	$ai,%rax
   1397 	 mov	$A0[1],16($tptr,$j)	# t[6]
   1398 	mov	%rdx,$A1[1]
   1399 	adc	\$0,$A1[1]
   1400 	 lea	32($j),$j
   1401 
   1402 	mul	$a0			# a[7]*a[4]
   1403 	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
   1404 	 mov	$ai,%rax
   1405 	mov	%rdx,$A0[1]
   1406 	adc	\$0,$A0[1]
   1407 	add	$A1[0],$A0[0]
   1408 	adc	\$0,$A0[1]
   1409 	mov	$A0[0],-8($tptr,$j)	# t[7]
   1410 
   1411 	cmp	\$0,$j
   1412 	jne	.Lsqr4x_1st
   1413 
   1414 	mul	$a1			# a[7]*a[5]
   1415 	add	%rax,$A1[1]
   1416 	lea	16($i),$i
   1417 	adc	\$0,%rdx
   1418 	add	$A0[1],$A1[1]
   1419 	adc	\$0,%rdx
   1420 
   1421 	mov	$A1[1],($tptr)		# t[8]
   1422 	mov	%rdx,$A1[0]
   1423 	mov	%rdx,8($tptr)		# t[9]
   1424 	jmp	.Lsqr4x_outer
   1425 
   1426 .align	32
   1427 .Lsqr4x_outer:				# comments apply to $num==6 case
   1428 	mov	-32($aptr,$i),$a0	# a[0]
   1429 	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
   1430 	mov	-24($aptr,$i),%rax	# a[1]
   1431 	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
   1432 	mov	-16($aptr,$i),$ai	# a[2]
   1433 	mov	%rax,$a1
   1434 
   1435 	mul	$a0			# a[1]*a[0]
   1436 	mov	-24($tptr,$i),$A0[0]	# t[1]
   1437 	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
   1438 	 mov	$ai,%rax		# a[2]
   1439 	adc	\$0,%rdx
   1440 	mov	$A0[0],-24($tptr,$i)	# t[1]
   1441 	mov	%rdx,$A0[1]
   1442 
   1443 	mul	$a0			# a[2]*a[0]
   1444 	add	%rax,$A0[1]
   1445 	 mov	$ai,%rax
   1446 	adc	\$0,%rdx
   1447 	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
   1448 	mov	%rdx,$A0[0]
   1449 	adc	\$0,$A0[0]
   1450 	mov	$A0[1],-16($tptr,$i)	# t[2]
   1451 
   1452 	xor	$A1[0],$A1[0]
   1453 
   1454 	 mov	-8($aptr,$i),$ai	# a[3]
   1455 	mul	$a1			# a[2]*a[1]
   1456 	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
   1457 	 mov	$ai,%rax
   1458 	adc	\$0,%rdx
   1459 	add	-8($tptr,$i),$A1[0]
   1460 	mov	%rdx,$A1[1]
   1461 	adc	\$0,$A1[1]
   1462 
   1463 	mul	$a0			# a[3]*a[0]
   1464 	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
   1465 	 mov	$ai,%rax
   1466 	adc	\$0,%rdx
   1467 	add	$A1[0],$A0[0]
   1468 	mov	%rdx,$A0[1]
   1469 	adc	\$0,$A0[1]
   1470 	mov	$A0[0],-8($tptr,$i)	# t[3]
   1471 
   1472 	lea	($i),$j
   1473 	jmp	.Lsqr4x_inner
   1474 
   1475 .align	32
   1476 .Lsqr4x_inner:
   1477 	 mov	($aptr,$j),$ai		# a[4]
   1478 	mul	$a1			# a[3]*a[1]
   1479 	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
   1480 	 mov	$ai,%rax
   1481 	mov	%rdx,$A1[0]
   1482 	adc	\$0,$A1[0]
   1483 	add	($tptr,$j),$A1[1]
   1484 	adc	\$0,$A1[0]
   1485 
   1486 	.byte	0x67
   1487 	mul	$a0			# a[4]*a[0]
   1488 	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
   1489 	 mov	$ai,%rax		# a[3]
   1490 	 mov	8($aptr,$j),$ai		# a[5]
   1491 	mov	%rdx,$A0[0]
   1492 	adc	\$0,$A0[0]
   1493 	add	$A1[1],$A0[1]
   1494 	adc	\$0,$A0[0]
   1495 
   1496 	mul	$a1			# a[4]*a[3]
   1497 	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
   1498 	mov	$A0[1],($tptr,$j)	# t[4]
   1499 	 mov	$ai,%rax
   1500 	mov	%rdx,$A1[1]
   1501 	adc	\$0,$A1[1]
   1502 	add	8($tptr,$j),$A1[0]
   1503 	lea	16($j),$j		# j++
   1504 	adc	\$0,$A1[1]
   1505 
   1506 	mul	$a0			# a[5]*a[2]
   1507 	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
   1508 	 mov	$ai,%rax
   1509 	adc	\$0,%rdx
   1510 	add	$A1[0],$A0[0]
   1511 	mov	%rdx,$A0[1]
   1512 	adc	\$0,$A0[1]
   1513 	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
   1514 
   1515 	cmp	\$0,$j
   1516 	jne	.Lsqr4x_inner
   1517 
   1518 	.byte	0x67
   1519 	mul	$a1			# a[5]*a[3]
   1520 	add	%rax,$A1[1]
   1521 	adc	\$0,%rdx
   1522 	add	$A0[1],$A1[1]
   1523 	adc	\$0,%rdx
   1524 
   1525 	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
   1526 	mov	%rdx,$A1[0]
   1527 	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below
   1528 
   1529 	add	\$16,$i
   1530 	jnz	.Lsqr4x_outer
   1531 
   1532 					# comments apply to $num==4 case
   1533 	mov	-32($aptr),$a0		# a[0]
   1534 	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
   1535 	mov	-24($aptr),%rax		# a[1]
   1536 	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
   1537 	mov	-16($aptr),$ai		# a[2]
   1538 	mov	%rax,$a1
   1539 
   1540 	mul	$a0			# a[1]*a[0]
   1541 	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
   1542 	 mov	$ai,%rax		# a[2]
   1543 	mov	%rdx,$A0[1]
   1544 	adc	\$0,$A0[1]
   1545 
   1546 	mul	$a0			# a[2]*a[0]
   1547 	add	%rax,$A0[1]
   1548 	 mov	$ai,%rax
   1549 	 mov	$A0[0],-24($tptr)	# t[1]
   1550 	mov	%rdx,$A0[0]
   1551 	adc	\$0,$A0[0]
   1552 	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
   1553 	 mov	-8($aptr),$ai		# a[3]
   1554 	adc	\$0,$A0[0]
   1555 
   1556 	mul	$a1			# a[2]*a[1]
   1557 	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
   1558 	 mov	$ai,%rax
   1559 	 mov	$A0[1],-16($tptr)	# t[2]
   1560 	mov	%rdx,$A1[1]
   1561 	adc	\$0,$A1[1]
   1562 
   1563 	mul	$a0			# a[3]*a[0]
   1564 	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
   1565 	 mov	$ai,%rax
   1566 	mov	%rdx,$A0[1]
   1567 	adc	\$0,$A0[1]
   1568 	add	$A1[0],$A0[0]
   1569 	adc	\$0,$A0[1]
   1570 	mov	$A0[0],-8($tptr)	# t[3]
   1571 
   1572 	mul	$a1			# a[3]*a[1]
   1573 	add	%rax,$A1[1]
   1574 	 mov	-16($aptr),%rax		# a[2]
   1575 	adc	\$0,%rdx
   1576 	add	$A0[1],$A1[1]
   1577 	adc	\$0,%rdx
   1578 
   1579 	mov	$A1[1],($tptr)		# t[4]
   1580 	mov	%rdx,$A1[0]
   1581 	mov	%rdx,8($tptr)		# t[5]
   1582 
   1583 	mul	$ai			# a[2]*a[3]
   1584 ___
   1585 {
   1586 my ($shift,$carry)=($a0,$a1);
   1587 my @S=(@A1,$ai,$n0);
   1588 $code.=<<___;
   1589 	 add	\$16,$i
   1590 	 xor	$shift,$shift
   1591 	 sub	$num,$i			# $i=16-$num
   1592 	 xor	$carry,$carry
   1593 
   1594 	add	$A1[0],%rax		# t[5]
   1595 	adc	\$0,%rdx
   1596 	mov	%rax,8($tptr)		# t[5]
   1597 	mov	%rdx,16($tptr)		# t[6]
   1598 	mov	$carry,24($tptr)	# t[7]
   1599 
   1600 	 mov	-16($aptr,$i),%rax	# a[0]
   1601 	lea	48+8(%rsp),$tptr
   1602 	 xor	$A0[0],$A0[0]		# t[0]
   1603 	 mov	8($tptr),$A0[1]		# t[1]
   1604 
   1605 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1606 	shr	\$63,$A0[0]
   1607 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1608 	shr	\$63,$A0[1]
   1609 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1610 	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1611 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1612 	mul	%rax			# a[i]*a[i]
   1613 	neg	$carry			# mov $carry,cf
   1614 	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1615 	adc	%rax,$S[0]
   1616 	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
   1617 	mov	$S[0],($tptr)
   1618 	adc	%rdx,$S[1]
   1619 
   1620 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
   1621 	 mov	$S[1],8($tptr)
   1622 	 sbb	$carry,$carry		# mov cf,$carry
   1623 	shr	\$63,$A0[0]
   1624 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1625 	shr	\$63,$A0[1]
   1626 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1627 	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1628 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1629 	mul	%rax			# a[i]*a[i]
   1630 	neg	$carry			# mov $carry,cf
   1631 	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1632 	adc	%rax,$S[2]
   1633 	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
   1634 	mov	$S[2],16($tptr)
   1635 	adc	%rdx,$S[3]
   1636 	lea	16($i),$i
   1637 	mov	$S[3],24($tptr)
   1638 	sbb	$carry,$carry		# mov cf,$carry
   1639 	lea	64($tptr),$tptr
   1640 	jmp	.Lsqr4x_shift_n_add
   1641 
   1642 .align	32
   1643 .Lsqr4x_shift_n_add:
   1644 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1645 	shr	\$63,$A0[0]
   1646 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1647 	shr	\$63,$A0[1]
   1648 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1649 	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1650 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1651 	mul	%rax			# a[i]*a[i]
   1652 	neg	$carry			# mov $carry,cf
   1653 	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1654 	adc	%rax,$S[0]
   1655 	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
   1656 	mov	$S[0],-32($tptr)
   1657 	adc	%rdx,$S[1]
   1658 
   1659 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
   1660 	 mov	$S[1],-24($tptr)
   1661 	 sbb	$carry,$carry		# mov cf,$carry
   1662 	shr	\$63,$A0[0]
   1663 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1664 	shr	\$63,$A0[1]
   1665 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1666 	 mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
   1667 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1668 	mul	%rax			# a[i]*a[i]
   1669 	neg	$carry			# mov $carry,cf
   1670 	 mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
   1671 	adc	%rax,$S[2]
   1672 	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
   1673 	mov	$S[2],-16($tptr)
   1674 	adc	%rdx,$S[3]
   1675 
   1676 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1677 	 mov	$S[3],-8($tptr)
   1678 	 sbb	$carry,$carry		# mov cf,$carry
   1679 	shr	\$63,$A0[0]
   1680 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1681 	shr	\$63,$A0[1]
   1682 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1683 	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1684 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1685 	mul	%rax			# a[i]*a[i]
   1686 	neg	$carry			# mov $carry,cf
   1687 	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1688 	adc	%rax,$S[0]
   1689 	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
   1690 	mov	$S[0],0($tptr)
   1691 	adc	%rdx,$S[1]
   1692 
   1693 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
   1694 	 mov	$S[1],8($tptr)
   1695 	 sbb	$carry,$carry		# mov cf,$carry
   1696 	shr	\$63,$A0[0]
   1697 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1698 	shr	\$63,$A0[1]
   1699 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1700 	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1701 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1702 	mul	%rax			# a[i]*a[i]
   1703 	neg	$carry			# mov $carry,cf
   1704 	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1705 	adc	%rax,$S[2]
   1706 	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
   1707 	mov	$S[2],16($tptr)
   1708 	adc	%rdx,$S[3]
   1709 	mov	$S[3],24($tptr)
   1710 	sbb	$carry,$carry		# mov cf,$carry
   1711 	lea	64($tptr),$tptr
   1712 	add	\$32,$i
   1713 	jnz	.Lsqr4x_shift_n_add
   1714 
   1715 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1716 	.byte	0x67
   1717 	shr	\$63,$A0[0]
   1718 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1719 	shr	\$63,$A0[1]
   1720 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1721 	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1722 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1723 	mul	%rax			# a[i]*a[i]
   1724 	neg	$carry			# mov $carry,cf
   1725 	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1726 	adc	%rax,$S[0]
   1727 	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
   1728 	mov	$S[0],-32($tptr)
   1729 	adc	%rdx,$S[1]
   1730 
   1731 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
   1732 	 mov	$S[1],-24($tptr)
   1733 	 sbb	$carry,$carry		# mov cf,$carry
   1734 	shr	\$63,$A0[0]
   1735 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1736 	shr	\$63,$A0[1]
   1737 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1738 	mul	%rax			# a[i]*a[i]
   1739 	neg	$carry			# mov $carry,cf
   1740 	adc	%rax,$S[2]
   1741 	adc	%rdx,$S[3]
   1742 	mov	$S[2],-16($tptr)
   1743 	mov	$S[3],-8($tptr)
   1744 ___
   1745 }
   1747 ######################################################################
   1748 # Montgomery reduction part, "word-by-word" algorithm.
   1749 #
   1750 # This new path is inspired by multiple submissions from Intel, by
   1751 # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
   1752 # Vinodh Gopal...
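#
# A C-style sketch of the reduction below (illustrative only): t[] holds the
# 2*num-word square, and each pass eliminates one low word (the code does
# eight per iteration):
#
#	for (i = 0; i < num; i++) {
#		BN_ULONG m = t[i] * n0;			/* mod 2^64 */
#		/* t[i..i+num] += m * n[]; t[i] becomes zero by construction */
#	}
#	/* the result is t[num..2*num-1] plus the accumulated carry,
#	 * followed by the post-condition (__bn_post4x_internal) */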
   1753 {
   1754 my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
   1755 
   1756 $code.=<<___;
   1757 	movq	%xmm2,$nptr
   1758 __bn_sqr8x_reduction:
   1759 	xor	%rax,%rax
   1760 	lea	($nptr,$num),%rcx	# end of n[]
   1761 	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
   1762 	mov	%rcx,0+8(%rsp)
   1763 	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
   1764 	mov	%rdx,8+8(%rsp)
   1765 	neg	$num
   1766 	jmp	.L8x_reduction_loop
   1767 
   1768 .align	32
   1769 .L8x_reduction_loop:
   1770 	lea	($tptr,$num),$tptr	# start of current t[] window
   1771 	.byte	0x66
   1772 	mov	8*0($tptr),$m0
   1773 	mov	8*1($tptr),%r9
   1774 	mov	8*2($tptr),%r10
   1775 	mov	8*3($tptr),%r11
   1776 	mov	8*4($tptr),%r12
   1777 	mov	8*5($tptr),%r13
   1778 	mov	8*6($tptr),%r14
   1779 	mov	8*7($tptr),%r15
   1780 	mov	%rax,(%rdx)		# store top-most carry bit
   1781 	lea	8*8($tptr),$tptr
   1782 
   1783 	.byte	0x67
   1784 	mov	$m0,%r8
   1785 	imulq	32+8(%rsp),$m0		# n0*a[0]
   1786 	mov	8*0($nptr),%rax		# n[0]
   1787 	mov	\$8,%ecx
   1788 	jmp	.L8x_reduce
   1789 
   1790 .align	32
   1791 .L8x_reduce:
   1792 	mulq	$m0
   1793 	 mov	8*1($nptr),%rax		# n[1]
   1794 	neg	%r8
   1795 	mov	%rdx,%r8
   1796 	adc	\$0,%r8
   1797 
   1798 	mulq	$m0
   1799 	add	%rax,%r9
   1800 	 mov	8*2($nptr),%rax
   1801 	adc	\$0,%rdx
   1802 	add	%r9,%r8
   1803 	 mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
   1804 	mov	%rdx,%r9
   1805 	adc	\$0,%r9
   1806 
   1807 	mulq	$m0
   1808 	add	%rax,%r10
   1809 	 mov	8*3($nptr),%rax
   1810 	adc	\$0,%rdx
   1811 	add	%r10,%r9
   1812 	 mov	32+8(%rsp),$carry	# pull n0, borrow $carry
   1813 	mov	%rdx,%r10
   1814 	adc	\$0,%r10
   1815 
   1816 	mulq	$m0
   1817 	add	%rax,%r11
   1818 	 mov	8*4($nptr),%rax
   1819 	adc	\$0,%rdx
   1820 	 imulq	%r8,$carry		# modulo-scheduled
   1821 	add	%r11,%r10
   1822 	mov	%rdx,%r11
   1823 	adc	\$0,%r11
   1824 
   1825 	mulq	$m0
   1826 	add	%rax,%r12
   1827 	 mov	8*5($nptr),%rax
   1828 	adc	\$0,%rdx
   1829 	add	%r12,%r11
   1830 	mov	%rdx,%r12
   1831 	adc	\$0,%r12
   1832 
   1833 	mulq	$m0
   1834 	add	%rax,%r13
   1835 	 mov	8*6($nptr),%rax
   1836 	adc	\$0,%rdx
   1837 	add	%r13,%r12
   1838 	mov	%rdx,%r13
   1839 	adc	\$0,%r13
   1840 
   1841 	mulq	$m0
   1842 	add	%rax,%r14
   1843 	 mov	8*7($nptr),%rax
   1844 	adc	\$0,%rdx
   1845 	add	%r14,%r13
   1846 	mov	%rdx,%r14
   1847 	adc	\$0,%r14
   1848 
   1849 	mulq	$m0
   1850 	 mov	$carry,$m0		# n0*a[i]
   1851 	add	%rax,%r15
   1852 	 mov	8*0($nptr),%rax		# n[0]
   1853 	adc	\$0,%rdx
   1854 	add	%r15,%r14
   1855 	mov	%rdx,%r15
   1856 	adc	\$0,%r15
   1857 
   1858 	dec	%ecx
   1859 	jnz	.L8x_reduce
   1860 
   1861 	lea	8*8($nptr),$nptr
   1862 	xor	%rax,%rax
   1863 	mov	8+8(%rsp),%rdx		# pull end of t[]
   1864 	cmp	0+8(%rsp),$nptr		# end of n[]?
   1865 	jae	.L8x_no_tail
   1866 
   1867 	.byte	0x66
   1868 	add	8*0($tptr),%r8
   1869 	adc	8*1($tptr),%r9
   1870 	adc	8*2($tptr),%r10
   1871 	adc	8*3($tptr),%r11
   1872 	adc	8*4($tptr),%r12
   1873 	adc	8*5($tptr),%r13
   1874 	adc	8*6($tptr),%r14
   1875 	adc	8*7($tptr),%r15
   1876 	sbb	$carry,$carry		# top carry
   1877 
   1878 	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
   1879 	mov	\$8,%ecx
   1880 	mov	8*0($nptr),%rax
   1881 	jmp	.L8x_tail
   1882 
   1883 .align	32
   1884 .L8x_tail:
   1885 	mulq	$m0
   1886 	add	%rax,%r8
   1887 	 mov	8*1($nptr),%rax
   1888 	 mov	%r8,($tptr)		# save result
   1889 	mov	%rdx,%r8
   1890 	adc	\$0,%r8
   1891 
   1892 	mulq	$m0
   1893 	add	%rax,%r9
   1894 	 mov	8*2($nptr),%rax
   1895 	adc	\$0,%rdx
   1896 	add	%r9,%r8
   1897 	 lea	8($tptr),$tptr		# $tptr++
   1898 	mov	%rdx,%r9
   1899 	adc	\$0,%r9
   1900 
   1901 	mulq	$m0
   1902 	add	%rax,%r10
   1903 	 mov	8*3($nptr),%rax
   1904 	adc	\$0,%rdx
   1905 	add	%r10,%r9
   1906 	mov	%rdx,%r10
   1907 	adc	\$0,%r10
   1908 
   1909 	mulq	$m0
   1910 	add	%rax,%r11
   1911 	 mov	8*4($nptr),%rax
   1912 	adc	\$0,%rdx
   1913 	add	%r11,%r10
   1914 	mov	%rdx,%r11
   1915 	adc	\$0,%r11
   1916 
   1917 	mulq	$m0
   1918 	add	%rax,%r12
   1919 	 mov	8*5($nptr),%rax
   1920 	adc	\$0,%rdx
   1921 	add	%r12,%r11
   1922 	mov	%rdx,%r12
   1923 	adc	\$0,%r12
   1924 
   1925 	mulq	$m0
   1926 	add	%rax,%r13
   1927 	 mov	8*6($nptr),%rax
   1928 	adc	\$0,%rdx
   1929 	add	%r13,%r12
   1930 	mov	%rdx,%r13
   1931 	adc	\$0,%r13
   1932 
   1933 	mulq	$m0
   1934 	add	%rax,%r14
   1935 	 mov	8*7($nptr),%rax
   1936 	adc	\$0,%rdx
   1937 	add	%r14,%r13
   1938 	mov	%rdx,%r14
   1939 	adc	\$0,%r14
   1940 
   1941 	mulq	$m0
   1942 	 mov	48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
   1943 	add	%rax,%r15
   1944 	adc	\$0,%rdx
   1945 	add	%r15,%r14
   1946 	 mov	8*0($nptr),%rax		# pull n[0]
   1947 	mov	%rdx,%r15
   1948 	adc	\$0,%r15
   1949 
   1950 	dec	%ecx
   1951 	jnz	.L8x_tail
   1952 
   1953 	lea	8*8($nptr),$nptr
   1954 	mov	8+8(%rsp),%rdx		# pull end of t[]
   1955 	cmp	0+8(%rsp),$nptr		# end of n[]?
   1956 	jae	.L8x_tail_done		# break out of loop
   1957 
   1958 	 mov	48+56+8(%rsp),$m0	# pull n0*a[0]
   1959 	neg	$carry
   1960 	 mov	8*0($nptr),%rax		# pull n[0]
   1961 	adc	8*0($tptr),%r8
   1962 	adc	8*1($tptr),%r9
   1963 	adc	8*2($tptr),%r10
   1964 	adc	8*3($tptr),%r11
   1965 	adc	8*4($tptr),%r12
   1966 	adc	8*5($tptr),%r13
   1967 	adc	8*6($tptr),%r14
   1968 	adc	8*7($tptr),%r15
   1969 	sbb	$carry,$carry		# top carry
   1970 
   1971 	mov	\$8,%ecx
   1972 	jmp	.L8x_tail
   1973 
   1974 .align	32
   1975 .L8x_tail_done:
   1976 	xor	%rax,%rax
   1977 	add	(%rdx),%r8		# can this overflow?
   1978 	adc	\$0,%r9
   1979 	adc	\$0,%r10
   1980 	adc	\$0,%r11
   1981 	adc	\$0,%r12
   1982 	adc	\$0,%r13
   1983 	adc	\$0,%r14
   1984 	adc	\$0,%r15
   1985 	adc	\$0,%rax
   1986 
   1987 	neg	$carry
   1988 .L8x_no_tail:
   1989 	adc	8*0($tptr),%r8
   1990 	adc	8*1($tptr),%r9
   1991 	adc	8*2($tptr),%r10
   1992 	adc	8*3($tptr),%r11
   1993 	adc	8*4($tptr),%r12
   1994 	adc	8*5($tptr),%r13
   1995 	adc	8*6($tptr),%r14
   1996 	adc	8*7($tptr),%r15
   1997 	adc	\$0,%rax		# top-most carry
   1998 	 mov	-8($nptr),%rcx		# np[num-1]
   1999 	 xor	$carry,$carry
   2000 
   2001 	movq	%xmm2,$nptr		# restore $nptr
   2002 
   2003 	mov	%r8,8*0($tptr)		# store top 512 bits
   2004 	mov	%r9,8*1($tptr)
   2005 	 movq	%xmm3,$num		# $num is %r9, can't be moved upwards
   2006 	mov	%r10,8*2($tptr)
   2007 	mov	%r11,8*3($tptr)
   2008 	mov	%r12,8*4($tptr)
   2009 	mov	%r13,8*5($tptr)
   2010 	mov	%r14,8*6($tptr)
   2011 	mov	%r15,8*7($tptr)
   2012 	lea	8*8($tptr),$tptr
   2013 
   2014 	cmp	%rdx,$tptr		# end of t[]?
   2015 	jb	.L8x_reduction_loop
   2016 	ret
   2017 .size	bn_sqr8x_internal,.-bn_sqr8x_internal
   2018 ___
   2019 }
   2021 ##############################################################
   2022 # Post-condition, 4x unrolled
   2023 #
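        # Branch-free conditional subtraction: %rax is turned into a 0/all-ones
        # mask derived from the top-most carry of the reduction, the complemented
        # modulus words are ANDed with it, and t[] + (-n[] & mask) is streamed
        # out to rp[], so n[] is subtracted exactly when the mask is set.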
   2024 {
   2025 my ($tptr,$nptr)=("%rbx","%rbp");
   2026 $code.=<<___;
   2027 .type	__bn_post4x_internal,\@abi-omnipotent
   2028 .align	32
   2029 __bn_post4x_internal:
   2030 	mov	8*0($nptr),%r12
   2031 	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
   2032 	mov	$num,%rcx
   2033 	movq	%xmm1,$rptr		# restore $rptr
   2034 	neg	%rax
   2035 	movq	%xmm1,$aptr		# prepare for back-to-back call
   2036 	sar	\$3+2,%rcx
   2037 	dec	%r12			# so that after 'not' we get -n[0]
   2038 	xor	%r10,%r10
   2039 	mov	8*1($nptr),%r13
   2040 	mov	8*2($nptr),%r14
   2041 	mov	8*3($nptr),%r15
   2042 	jmp	.Lsqr4x_sub_entry
   2043 
   2044 .align	16
   2045 .Lsqr4x_sub:
   2046 	mov	8*0($nptr),%r12
   2047 	mov	8*1($nptr),%r13
   2048 	mov	8*2($nptr),%r14
   2049 	mov	8*3($nptr),%r15
   2050 .Lsqr4x_sub_entry:
   2051 	lea	8*4($nptr),$nptr
   2052 	not	%r12
   2053 	not	%r13
   2054 	not	%r14
   2055 	not	%r15
   2056 	and	%rax,%r12
   2057 	and	%rax,%r13
   2058 	and	%rax,%r14
   2059 	and	%rax,%r15
   2060 
   2061 	neg	%r10			# mov %r10,%cf
   2062 	adc	8*0($tptr),%r12
   2063 	adc	8*1($tptr),%r13
   2064 	adc	8*2($tptr),%r14
   2065 	adc	8*3($tptr),%r15
   2066 	mov	%r12,8*0($rptr)
   2067 	lea	8*4($tptr),$tptr
   2068 	mov	%r13,8*1($rptr)
   2069 	sbb	%r10,%r10		# mov %cf,%r10
   2070 	mov	%r14,8*2($rptr)
   2071 	mov	%r15,8*3($rptr)
   2072 	lea	8*4($rptr),$rptr
   2073 
   2074 	inc	%rcx			# pass %cf
   2075 	jnz	.Lsqr4x_sub
   2076 
   2077 	mov	$num,%r10		# prepare for back-to-back call
   2078 	neg	$num			# restore $num
   2079 	ret
   2080 .size	__bn_post4x_internal,.-__bn_post4x_internal
   2081 ___
   2082 }
   2083 {
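        # bn_from_montgomery() converts from Montgomery representation, i.e.
        # computes ap*R^-1 mod np: ap is copied into the low half of t[], the
        # high half is zeroed, and the 8x reduction plus post-condition run on
        # the result.  Only lengths divisible by 8 are handled; for other
        # lengths the routine returns 0.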
   2084 $code.=<<___;
   2085 .globl	bn_from_montgomery
   2086 .type	bn_from_montgomery,\@abi-omnipotent
   2087 .align	32
   2088 bn_from_montgomery:
   2089 	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
   2090 	jz	bn_from_mont8x
   2091 	xor	%eax,%eax
   2092 	ret
   2093 .size	bn_from_montgomery,.-bn_from_montgomery
   2094 
   2095 .type	bn_from_mont8x,\@function,6
   2096 .align	32
   2097 bn_from_mont8x:
   2098 .cfi_startproc
   2099 	.byte	0x67
   2100 	mov	%rsp,%rax
   2101 .cfi_def_cfa_register	%rax
   2102 	push	%rbx
   2103 .cfi_push	%rbx
   2104 	push	%rbp
   2105 .cfi_push	%rbp
   2106 	push	%r12
   2107 .cfi_push	%r12
   2108 	push	%r13
   2109 .cfi_push	%r13
   2110 	push	%r14
   2111 .cfi_push	%r14
   2112 	push	%r15
   2113 .cfi_push	%r15
   2114 .Lfrom_prologue:
   2115 
   2116 	shl	\$3,${num}d		# convert $num to bytes
   2117 	lea	($num,$num,2),%r10	# 3*$num in bytes
   2118 	neg	$num
   2119 	mov	($n0),$n0		# *n0
   2120 
   2121 	##############################################################
   2122 	# Ensure that stack frame doesn't alias with $rptr+3*$num
   2123 	# modulo 4096, which covers ret[num], am[num] and n[num]
   2124 	# (see bn_exp.c). The stack is allocated so as to align with
   2125 	# bn_power5's frame, and as bn_from_montgomery happens to be the
   2126 	# last operation, we use the opportunity to cleanse it.
   2127 	#
   2128 	lea	-320(%rsp,$num,2),%r11
   2129 	mov	%rsp,%rbp
   2130 	sub	$rptr,%r11
   2131 	and	\$4095,%r11
   2132 	cmp	%r11,%r10
   2133 	jb	.Lfrom_sp_alt
   2134 	sub	%r11,%rbp		# align with $aptr
   2135 	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
   2136 	jmp	.Lfrom_sp_done
   2137 
   2138 .align	32
   2139 .Lfrom_sp_alt:
   2140 	lea	4096-320(,$num,2),%r10
   2141 	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
   2142 	sub	%r10,%r11
   2143 	mov	\$0,%r10
   2144 	cmovc	%r10,%r11
   2145 	sub	%r11,%rbp
   2146 .Lfrom_sp_done:
   2147 	and	\$-64,%rbp
   2148 	mov	%rsp,%r11
   2149 	sub	%rbp,%r11
   2150 	and	\$-4096,%r11
   2151 	lea	(%rbp,%r11),%rsp
   2152 	mov	(%rsp),%r10
   2153 	cmp	%rbp,%rsp
   2154 	ja	.Lfrom_page_walk
   2155 	jmp	.Lfrom_page_walk_done
   2156 
   2157 .Lfrom_page_walk:
   2158 	lea	-4096(%rsp),%rsp
   2159 	mov	(%rsp),%r10
   2160 	cmp	%rbp,%rsp
   2161 	ja	.Lfrom_page_walk
   2162 .Lfrom_page_walk_done:
   2163 
   2164 	mov	$num,%r10
   2165 	neg	$num
   2166 
   2167 	##############################################################
   2168 	# Stack layout
   2169 	#
   2170 	# +0	saved $num, used in reduction section
   2171 	# +8	&t[2*$num], used in reduction section
   2172 	# +32	saved *n0
   2173 	# +40	saved %rsp
   2174 	# +48	t[2*$num]
   2175 	#
   2176 	mov	$n0,  32(%rsp)
   2177 	mov	%rax, 40(%rsp)		# save original %rsp
   2178 .cfi_cfa_expression	%rsp+40,deref,+8
   2179 .Lfrom_body:
   2180 	mov	$num,%r11
   2181 	lea	48(%rsp),%rax
   2182 	pxor	%xmm0,%xmm0
   2183 	jmp	.Lmul_by_1
   2184 
   2185 .align	32
   2186 .Lmul_by_1:
   2187 	movdqu	($aptr),%xmm1
   2188 	movdqu	16($aptr),%xmm2
   2189 	movdqu	32($aptr),%xmm3
   2190 	movdqa	%xmm0,(%rax,$num)
   2191 	movdqu	48($aptr),%xmm4
   2192 	movdqa	%xmm0,16(%rax,$num)
   2193 	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea	64($aptr),$aptr
   2194 	movdqa	%xmm1,(%rax)
   2195 	movdqa	%xmm0,32(%rax,$num)
   2196 	movdqa	%xmm2,16(%rax)
   2197 	movdqa	%xmm0,48(%rax,$num)
   2198 	movdqa	%xmm3,32(%rax)
   2199 	movdqa	%xmm4,48(%rax)
   2200 	lea	64(%rax),%rax
   2201 	sub	\$64,%r11
   2202 	jnz	.Lmul_by_1
   2203 
   2204 	movq	$rptr,%xmm1
   2205 	movq	$nptr,%xmm2
   2206 	.byte	0x67
   2207 	mov	$nptr,%rbp
   2208 	movq	%r10, %xmm3		# -num
   2209 ___
   2210 $code.=<<___ if ($addx);
   2211 	leaq	OPENSSL_ia32cap_P(%rip),%r11
   2212 	mov	8(%r11),%r11d
   2213 	and	\$0x80108,%r11d
   2214 	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
   2215 	jne	.Lfrom_mont_nox
   2216 
   2217 	lea	(%rax,$num),$rptr
   2218 	call	__bn_sqrx8x_reduction
   2219 	call	__bn_postx4x_internal
   2220 
   2221 	pxor	%xmm0,%xmm0
   2222 	lea	48(%rsp),%rax
   2223 	jmp	.Lfrom_mont_zero
   2224 
   2225 .align	32
   2226 .Lfrom_mont_nox:
   2227 ___
   2228 $code.=<<___;
   2229 	call	__bn_sqr8x_reduction
   2230 	call	__bn_post4x_internal
   2231 
   2232 	pxor	%xmm0,%xmm0
   2233 	lea	48(%rsp),%rax
   2234 	jmp	.Lfrom_mont_zero
   2235 
   2236 .align	32
   2237 .Lfrom_mont_zero:
   2238 	mov	40(%rsp),%rsi		# restore %rsp
   2239 .cfi_def_cfa	%rsi,8
   2240 	movdqa	%xmm0,16*0(%rax)
   2241 	movdqa	%xmm0,16*1(%rax)
   2242 	movdqa	%xmm0,16*2(%rax)
   2243 	movdqa	%xmm0,16*3(%rax)
   2244 	lea	16*4(%rax),%rax
   2245 	sub	\$32,$num
   2246 	jnz	.Lfrom_mont_zero
   2247 
   2248 	mov	\$1,%rax
   2249 	mov	-48(%rsi),%r15
   2250 .cfi_restore	%r15
   2251 	mov	-40(%rsi),%r14
   2252 .cfi_restore	%r14
   2253 	mov	-32(%rsi),%r13
   2254 .cfi_restore	%r13
   2255 	mov	-24(%rsi),%r12
   2256 .cfi_restore	%r12
   2257 	mov	-16(%rsi),%rbp
   2258 .cfi_restore	%rbp
   2259 	mov	-8(%rsi),%rbx
   2260 .cfi_restore	%rbx
   2261 	lea	(%rsi),%rsp
   2262 .cfi_def_cfa_register	%rsp
   2263 .Lfrom_epilogue:
   2264 	ret
   2265 .cfi_endproc
   2266 .size	bn_from_mont8x,.-bn_from_mont8x
   2267 ___
   2268 }
   2269 }}}
   2270 
   2272 if ($addx) {{{
   2273 my $bp="%rdx";	# restore original value
   2274 
   2275 $code.=<<___;
   2276 .type	bn_mulx4x_mont_gather5,\@function,6
   2277 .align	32
   2278 bn_mulx4x_mont_gather5:
   2279 .cfi_startproc
   2280 	mov	%rsp,%rax
   2281 .cfi_def_cfa_register	%rax
   2282 .Lmulx4x_enter:
   2283 	push	%rbx
   2284 .cfi_push	%rbx
   2285 	push	%rbp
   2286 .cfi_push	%rbp
   2287 	push	%r12
   2288 .cfi_push	%r12
   2289 	push	%r13
   2290 .cfi_push	%r13
   2291 	push	%r14
   2292 .cfi_push	%r14
   2293 	push	%r15
   2294 .cfi_push	%r15
   2295 .Lmulx4x_prologue:
   2296 
   2297 	shl	\$3,${num}d		# convert $num to bytes
   2298 	lea	($num,$num,2),%r10	# 3*$num in bytes
   2299 	neg	$num			# -$num
   2300 	mov	($n0),$n0		# *n0
   2301 
   2302 	##############################################################
   2303 	# Ensure that stack frame doesn't alias with $rptr+3*$num
   2304 	# modulo 4096, which covers ret[num], am[num] and n[num]
   2305 	# (see bn_exp.c). This is done to allow the memory disambiguation
   2306 	# logic to do its magic. [Extra [num] is allocated in order
   2307 	# to align with bn_power5's frame, which is cleansed after
   2308 	# completing exponentiation. Extra 256 bytes are for the power mask
   2309 	# calculated from the 7th argument, the index.]
   2310 	#
   2311 	lea	-320(%rsp,$num,2),%r11
   2312 	mov	%rsp,%rbp
   2313 	sub	$rp,%r11
   2314 	and	\$4095,%r11
   2315 	cmp	%r11,%r10
   2316 	jb	.Lmulx4xsp_alt
   2317 	sub	%r11,%rbp		# align with $aptr
   2318 	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
   2319 	jmp	.Lmulx4xsp_done
   2320 
   2321 .Lmulx4xsp_alt:
   2322 	lea	4096-320(,$num,2),%r10
   2323 	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
   2324 	sub	%r10,%r11
   2325 	mov	\$0,%r10
   2326 	cmovc	%r10,%r11
   2327 	sub	%r11,%rbp
   2328 .Lmulx4xsp_done:
   2329 	and	\$-64,%rbp		# ensure alignment
   2330 	mov	%rsp,%r11
   2331 	sub	%rbp,%r11
   2332 	and	\$-4096,%r11
   2333 	lea	(%rbp,%r11),%rsp
   2334 	mov	(%rsp),%r10
   2335 	cmp	%rbp,%rsp
   2336 	ja	.Lmulx4x_page_walk
   2337 	jmp	.Lmulx4x_page_walk_done
   2338 
   2339 .Lmulx4x_page_walk:
   2340 	lea	-4096(%rsp),%rsp
   2341 	mov	(%rsp),%r10
   2342 	cmp	%rbp,%rsp
   2343 	ja	.Lmulx4x_page_walk
   2344 .Lmulx4x_page_walk_done:
   2345 
   2346 	##############################################################
   2347 	# Stack layout
   2348 	# +0	-num
   2349 	# +8	off-loaded &b[i]
   2350 	# +16	end of b[num]
   2351 	# +24	inner counter
   2352 	# +32	saved n0
   2353 	# +40	saved %rsp
   2354 	# +48
   2355 	# +56	saved rp
   2356 	# +64	tmp[num+1]
   2357 	#
   2358 	mov	$n0, 32(%rsp)		# save *n0
   2359 	mov	%rax,40(%rsp)		# save original %rsp
   2360 .cfi_cfa_expression	%rsp+40,deref,+8
   2361 .Lmulx4x_body:
   2362 	call	mulx4x_internal
   2363 
   2364 	mov	40(%rsp),%rsi		# restore %rsp
   2365 .cfi_def_cfa	%rsi,8
   2366 	mov	\$1,%rax
   2367 
   2368 	mov	-48(%rsi),%r15
   2369 .cfi_restore	%r15
   2370 	mov	-40(%rsi),%r14
   2371 .cfi_restore	%r14
   2372 	mov	-32(%rsi),%r13
   2373 .cfi_restore	%r13
   2374 	mov	-24(%rsi),%r12
   2375 .cfi_restore	%r12
   2376 	mov	-16(%rsi),%rbp
   2377 .cfi_restore	%rbp
   2378 	mov	-8(%rsi),%rbx
   2379 .cfi_restore	%rbx
   2380 	lea	(%rsi),%rsp
   2381 .cfi_def_cfa_register	%rsp
   2382 .Lmulx4x_epilogue:
   2383 	ret
   2384 .cfi_endproc
   2385 .size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
   2386 
   2387 .type	mulx4x_internal,\@abi-omnipotent
   2388 .align	32
   2389 mulx4x_internal:
   2390 	mov	$num,8(%rsp)		# save -$num (it was in bytes)
   2391 	mov	$num,%r10
   2392 	neg	$num			# restore $num
   2393 	shl	\$5,$num
   2394 	neg	%r10			# restore $num
   2395 	lea	128($bp,$num),%r13	# end of powers table (+size optimization)
   2396 	shr	\$5+5,$num
   2397 	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument
   2398 	sub	\$1,$num
   2399 	lea	.Linc(%rip),%rax
   2400 	mov	%r13,16+8(%rsp)		# end of b[num]
   2401 	mov	$num,24+8(%rsp)		# inner counter
   2402 	mov	$rp, 56+8(%rsp)		# save $rp
   2403 ___
   2404 my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   2405    ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
   2406 my $rptr=$bptr;
   2407 my $STRIDE=2**5*8;		# 5 is "window size"
   2408 my $N=$STRIDE/4;		# should match cache line size
   2409 $code.=<<___;
   2410 	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
   2411 	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
   2412 	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimization)
   2413 	lea	128($bp),$bptr		# size optimization
   2414 
   2415 	pshufd	\$0,%xmm5,%xmm5		# broadcast index
   2416 	movdqa	%xmm1,%xmm4
   2417 	.byte	0x67
   2418 	movdqa	%xmm1,%xmm2
   2419 ___
   2420 ########################################################################
   2421 # calculate mask by comparing 0..31 to index and save result to stack
   2422 #
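        # The resulting 256-byte mask has exactly one 64-bit lane set, the one
        # whose position equals the index.  Each 16-byte chunk of the current
        # row of the powers table is ANDed with its mask chunk and the chunks
        # are ORed/folded down to the single selected word, so every table
        # entry is touched regardless of the index.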
   2423 $code.=<<___;
   2424 	.byte	0x67
   2425 	paddd	%xmm0,%xmm1
   2426 	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
   2427 	movdqa	%xmm4,%xmm3
   2428 ___
   2429 for($i=0;$i<$STRIDE/16-4;$i+=4) {
   2430 $code.=<<___;
   2431 	paddd	%xmm1,%xmm2
   2432 	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
   2433 	movdqa	%xmm0,`16*($i+0)+112`(%r10)
   2434 	movdqa	%xmm4,%xmm0
   2435 
   2436 	paddd	%xmm2,%xmm3
   2437 	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
   2438 	movdqa	%xmm1,`16*($i+1)+112`(%r10)
   2439 	movdqa	%xmm4,%xmm1
   2440 
   2441 	paddd	%xmm3,%xmm0
   2442 	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
   2443 	movdqa	%xmm2,`16*($i+2)+112`(%r10)
   2444 	movdqa	%xmm4,%xmm2
   2445 
   2446 	paddd	%xmm0,%xmm1
   2447 	pcmpeqd	%xmm5,%xmm0
   2448 	movdqa	%xmm3,`16*($i+3)+112`(%r10)
   2449 	movdqa	%xmm4,%xmm3
   2450 ___
   2451 }
   2452 $code.=<<___;				# last iteration can be optimized
   2453 	.byte	0x67
   2454 	paddd	%xmm1,%xmm2
   2455 	pcmpeqd	%xmm5,%xmm1
   2456 	movdqa	%xmm0,`16*($i+0)+112`(%r10)
   2457 
   2458 	paddd	%xmm2,%xmm3
   2459 	pcmpeqd	%xmm5,%xmm2
   2460 	movdqa	%xmm1,`16*($i+1)+112`(%r10)
   2461 
   2462 	pcmpeqd	%xmm5,%xmm3
   2463 	movdqa	%xmm2,`16*($i+2)+112`(%r10)
   2464 
   2465 	pand	`16*($i+0)-128`($bptr),%xmm0	# while it's still in register
   2466 	pand	`16*($i+1)-128`($bptr),%xmm1
   2467 	pand	`16*($i+2)-128`($bptr),%xmm2
   2468 	movdqa	%xmm3,`16*($i+3)+112`(%r10)
   2469 	pand	`16*($i+3)-128`($bptr),%xmm3
   2470 	por	%xmm2,%xmm0
   2471 	por	%xmm3,%xmm1
   2472 ___
   2473 for($i=0;$i<$STRIDE/16-4;$i+=4) {
   2474 $code.=<<___;
   2475 	movdqa	`16*($i+0)-128`($bptr),%xmm4
   2476 	movdqa	`16*($i+1)-128`($bptr),%xmm5
   2477 	movdqa	`16*($i+2)-128`($bptr),%xmm2
   2478 	pand	`16*($i+0)+112`(%r10),%xmm4
   2479 	movdqa	`16*($i+3)-128`($bptr),%xmm3
   2480 	pand	`16*($i+1)+112`(%r10),%xmm5
   2481 	por	%xmm4,%xmm0
   2482 	pand	`16*($i+2)+112`(%r10),%xmm2
   2483 	por	%xmm5,%xmm1
   2484 	pand	`16*($i+3)+112`(%r10),%xmm3
   2485 	por	%xmm2,%xmm0
   2486 	por	%xmm3,%xmm1
   2487 ___
   2488 }
   2489 $code.=<<___;
   2490 	pxor	%xmm1,%xmm0
   2491 	pshufd	\$0x4e,%xmm0,%xmm1
   2492 	por	%xmm1,%xmm0
   2493 	lea	$STRIDE($bptr),$bptr
   2494 	movq	%xmm0,%rdx		# bp[0]
   2495 	lea	64+8*4+8(%rsp),$tptr
   2496 
   2497 	mov	%rdx,$bi
   2498 	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
   2499 	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
   2500 	add	%rax,%r11
   2501 	mulx	2*8($aptr),%rax,%r13	# ...
   2502 	adc	%rax,%r12
   2503 	adc	\$0,%r13
   2504 	mulx	3*8($aptr),%rax,%r14
   2505 
   2506 	mov	$mi,%r15
   2507 	imulq	32+8(%rsp),$mi		# "t[0]"*n0
   2508 	xor	$zero,$zero		# cf=0, of=0
   2509 	mov	$mi,%rdx
   2510 
   2511 	mov	$bptr,8+8(%rsp)		# off-load &b[i]
   2512 
   2513 	lea	4*8($aptr),$aptr
   2514 	adcx	%rax,%r13
   2515 	adcx	$zero,%r14		# cf=0
   2516 
   2517 	mulx	0*8($nptr),%rax,%r10
   2518 	adcx	%rax,%r15		# discarded
   2519 	adox	%r11,%r10
   2520 	mulx	1*8($nptr),%rax,%r11
   2521 	adcx	%rax,%r10
   2522 	adox	%r12,%r11
   2523 	mulx	2*8($nptr),%rax,%r12
   2524 	mov	24+8(%rsp),$bptr	# counter value
   2525 	mov	%r10,-8*4($tptr)
   2526 	adcx	%rax,%r11
   2527 	adox	%r13,%r12
   2528 	mulx	3*8($nptr),%rax,%r15
   2529 	 mov	$bi,%rdx
   2530 	mov	%r11,-8*3($tptr)
   2531 	adcx	%rax,%r12
   2532 	adox	$zero,%r15		# of=0
   2533 	lea	4*8($nptr),$nptr
   2534 	mov	%r12,-8*2($tptr)
   2535 	jmp	.Lmulx4x_1st
   2536 
   2537 .align	32
   2538 .Lmulx4x_1st:
   2539 	adcx	$zero,%r15		# cf=0, modulo-scheduled
   2540 	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
   2541 	adcx	%r14,%r10
   2542 	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
   2543 	adcx	%rax,%r11
   2544 	mulx	2*8($aptr),%r12,%rax	# ...
   2545 	adcx	%r14,%r12
   2546 	mulx	3*8($aptr),%r13,%r14
   2547 	 .byte	0x67,0x67
   2548 	 mov	$mi,%rdx
   2549 	adcx	%rax,%r13
   2550 	adcx	$zero,%r14		# cf=0
   2551 	lea	4*8($aptr),$aptr
   2552 	lea	4*8($tptr),$tptr
   2553 
   2554 	adox	%r15,%r10
   2555 	mulx	0*8($nptr),%rax,%r15
   2556 	adcx	%rax,%r10
   2557 	adox	%r15,%r11
   2558 	mulx	1*8($nptr),%rax,%r15
   2559 	adcx	%rax,%r11
   2560 	adox	%r15,%r12
   2561 	mulx	2*8($nptr),%rax,%r15
   2562 	mov	%r10,-5*8($tptr)
   2563 	adcx	%rax,%r12
   2564 	mov	%r11,-4*8($tptr)
   2565 	adox	%r15,%r13
   2566 	mulx	3*8($nptr),%rax,%r15
   2567 	 mov	$bi,%rdx
   2568 	mov	%r12,-3*8($tptr)
   2569 	adcx	%rax,%r13
   2570 	adox	$zero,%r15
   2571 	lea	4*8($nptr),$nptr
   2572 	mov	%r13,-2*8($tptr)
   2573 
   2574 	dec	$bptr			# of=0, pass cf
   2575 	jnz	.Lmulx4x_1st
   2576 
   2577 	mov	8(%rsp),$num		# load -num
   2578 	adc	$zero,%r15		# modulo-scheduled
   2579 	lea	($aptr,$num),$aptr	# rewind $aptr
   2580 	add	%r15,%r14
   2581 	mov	8+8(%rsp),$bptr		# re-load &b[i]
   2582 	adc	$zero,$zero		# top-most carry
   2583 	mov	%r14,-1*8($tptr)
   2584 	jmp	.Lmulx4x_outer
   2585 
   2586 .align	32
   2587 .Lmulx4x_outer:
   2588 	lea	16-256($tptr),%r10	# where 256-byte mask is (+density control)
   2589 	pxor	%xmm4,%xmm4
   2590 	.byte	0x67,0x67
   2591 	pxor	%xmm5,%xmm5
   2592 ___
   2593 for($i=0;$i<$STRIDE/16;$i+=4) {
   2594 $code.=<<___;
   2595 	movdqa	`16*($i+0)-128`($bptr),%xmm0
   2596 	movdqa	`16*($i+1)-128`($bptr),%xmm1
   2597 	movdqa	`16*($i+2)-128`($bptr),%xmm2
   2598 	pand	`16*($i+0)+256`(%r10),%xmm0
   2599 	movdqa	`16*($i+3)-128`($bptr),%xmm3
   2600 	pand	`16*($i+1)+256`(%r10),%xmm1
   2601 	por	%xmm0,%xmm4
   2602 	pand	`16*($i+2)+256`(%r10),%xmm2
   2603 	por	%xmm1,%xmm5
   2604 	pand	`16*($i+3)+256`(%r10),%xmm3
   2605 	por	%xmm2,%xmm4
   2606 	por	%xmm3,%xmm5
   2607 ___
   2608 }
   2609 $code.=<<___;
   2610 	por	%xmm5,%xmm4
   2611 	pshufd	\$0x4e,%xmm4,%xmm0
   2612 	por	%xmm4,%xmm0
   2613 	lea	$STRIDE($bptr),$bptr
   2614 	movq	%xmm0,%rdx		# m0=bp[i]
   2615 
   2616 	mov	$zero,($tptr)		# save top-most carry
   2617 	lea	4*8($tptr,$num),$tptr	# rewind $tptr
   2618 	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
   2619 	xor	$zero,$zero		# cf=0, of=0
   2620 	mov	%rdx,$bi
   2621 	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
   2622 	adox	-4*8($tptr),$mi		# +t[0]
   2623 	adcx	%r14,%r11
   2624 	mulx	2*8($aptr),%r15,%r13	# ...
   2625 	adox	-3*8($tptr),%r11
   2626 	adcx	%r15,%r12
   2627 	mulx	3*8($aptr),%rdx,%r14
   2628 	adox	-2*8($tptr),%r12
   2629 	adcx	%rdx,%r13
   2630 	lea	($nptr,$num),$nptr	# rewind $nptr
   2631 	lea	4*8($aptr),$aptr
   2632 	adox	-1*8($tptr),%r13
   2633 	adcx	$zero,%r14
   2634 	adox	$zero,%r14
   2635 
   2636 	mov	$mi,%r15
   2637 	imulq	32+8(%rsp),$mi		# "t[0]"*n0
   2638 
   2639 	mov	$mi,%rdx
   2640 	xor	$zero,$zero		# cf=0, of=0
   2641 	mov	$bptr,8+8(%rsp)		# off-load &b[i]
   2642 
   2643 	mulx	0*8($nptr),%rax,%r10
   2644 	adcx	%rax,%r15		# discarded
   2645 	adox	%r11,%r10
   2646 	mulx	1*8($nptr),%rax,%r11
   2647 	adcx	%rax,%r10
   2648 	adox	%r12,%r11
   2649 	mulx	2*8($nptr),%rax,%r12
   2650 	adcx	%rax,%r11
   2651 	adox	%r13,%r12
   2652 	mulx	3*8($nptr),%rax,%r15
   2653 	 mov	$bi,%rdx
   2654 	mov	24+8(%rsp),$bptr	# counter value
   2655 	mov	%r10,-8*4($tptr)
   2656 	adcx	%rax,%r12
   2657 	mov	%r11,-8*3($tptr)
   2658 	adox	$zero,%r15		# of=0
   2659 	mov	%r12,-8*2($tptr)
   2660 	lea	4*8($nptr),$nptr
   2661 	jmp	.Lmulx4x_inner
   2662 
   2663 .align	32
   2664 .Lmulx4x_inner:
   2665 	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
   2666 	adcx	$zero,%r15		# cf=0, modulo-scheduled
   2667 	adox	%r14,%r10
   2668 	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
   2669 	adcx	0*8($tptr),%r10
   2670 	adox	%rax,%r11
   2671 	mulx	2*8($aptr),%r12,%rax	# ...
   2672 	adcx	1*8($tptr),%r11
   2673 	adox	%r14,%r12
   2674 	mulx	3*8($aptr),%r13,%r14
   2675 	 mov	$mi,%rdx
   2676 	adcx	2*8($tptr),%r12
   2677 	adox	%rax,%r13
   2678 	adcx	3*8($tptr),%r13
   2679 	adox	$zero,%r14		# of=0
   2680 	lea	4*8($aptr),$aptr
   2681 	lea	4*8($tptr),$tptr
   2682 	adcx	$zero,%r14		# cf=0
   2683 
   2684 	adox	%r15,%r10
   2685 	mulx	0*8($nptr),%rax,%r15
   2686 	adcx	%rax,%r10
   2687 	adox	%r15,%r11
   2688 	mulx	1*8($nptr),%rax,%r15
   2689 	adcx	%rax,%r11
   2690 	adox	%r15,%r12
   2691 	mulx	2*8($nptr),%rax,%r15
   2692 	mov	%r10,-5*8($tptr)
   2693 	adcx	%rax,%r12
   2694 	adox	%r15,%r13
   2695 	mov	%r11,-4*8($tptr)
   2696 	mulx	3*8($nptr),%rax,%r15
   2697 	 mov	$bi,%rdx
   2698 	lea	4*8($nptr),$nptr
   2699 	mov	%r12,-3*8($tptr)
   2700 	adcx	%rax,%r13
   2701 	adox	$zero,%r15
   2702 	mov	%r13,-2*8($tptr)
   2703 
   2704 	dec	$bptr			# of=0, pass cf
   2705 	jnz	.Lmulx4x_inner
   2706 
   2707 	mov	0+8(%rsp),$num		# load -num
   2708 	adc	$zero,%r15		# modulo-scheduled
   2709 	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
   2710 	mov	8+8(%rsp),$bptr		# re-load &b[i]
   2711 	mov	16+8(%rsp),%r10
   2712 	adc	%r15,%r14
   2713 	lea	($aptr,$num),$aptr	# rewind $aptr
   2714 	adc	$zero,$zero		# top-most carry
   2715 	mov	%r14,-1*8($tptr)
   2716 
   2717 	cmp	%r10,$bptr
   2718 	jb	.Lmulx4x_outer
   2719 
   2720 	mov	-8($nptr),%r10
   2721 	mov	$zero,%r8
   2722 	mov	($nptr,$num),%r12
   2723 	lea	($nptr,$num),%rbp	# rewind $nptr
   2724 	mov	$num,%rcx
   2725 	lea	($tptr,$num),%rdi	# rewind $tptr
   2726 	xor	%eax,%eax
   2727 	xor	%r15,%r15
   2728 	sub	%r14,%r10		# compare top-most words
   2729 	adc	%r15,%r15
   2730 	or	%r15,%r8
   2731 	sar	\$3+2,%rcx
   2732 	sub	%r8,%rax		# %rax=-%r8
   2733 	mov	56+8(%rsp),%rdx		# restore rp
   2734 	dec	%r12			# so that after 'not' we get -n[0]
   2735 	mov	8*1(%rbp),%r13
   2736 	xor	%r8,%r8
   2737 	mov	8*2(%rbp),%r14
   2738 	mov	8*3(%rbp),%r15
   2739 	jmp	.Lsqrx4x_sub_entry	# common post-condition
   2740 .size	mulx4x_internal,.-mulx4x_internal
   2741 ___
   2742 }{
   2744 ######################################################################
   2745 # void bn_power5(
   2746 my $rptr="%rdi";	# BN_ULONG *rptr,
   2747 my $aptr="%rsi";	# const BN_ULONG *aptr,
   2748 my $bptr="%rdx";	# const void *table,
   2749 my $nptr="%rcx";	# const BN_ULONG *nptr,
   2750 my $n0  ="%r8";		# const BN_ULONG *n0);
   2751 my $num ="%r9";		# int num, has to be divisible by 8
   2752 			# int pwr);
   2753 
   2754 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
   2755 my @A0=("%r10","%r11");
   2756 my @A1=("%r12","%r13");
   2757 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
   2758 
   2759 $code.=<<___;
   2760 .type	bn_powerx5,\@function,6
   2761 .align	32
   2762 bn_powerx5:
   2763 .cfi_startproc
   2764 	mov	%rsp,%rax
   2765 .cfi_def_cfa_register	%rax
   2766 .Lpowerx5_enter:
   2767 	push	%rbx
   2768 .cfi_push	%rbx
   2769 	push	%rbp
   2770 .cfi_push	%rbp
   2771 	push	%r12
   2772 .cfi_push	%r12
   2773 	push	%r13
   2774 .cfi_push	%r13
   2775 	push	%r14
   2776 .cfi_push	%r14
   2777 	push	%r15
   2778 .cfi_push	%r15
   2779 .Lpowerx5_prologue:
   2780 
   2781 	shl	\$3,${num}d		# convert $num to bytes
   2782 	lea	($num,$num,2),%r10	# 3*$num in bytes
   2783 	neg	$num
   2784 	mov	($n0),$n0		# *n0
   2785 
   2786 	##############################################################
   2787 	# Ensure that stack frame doesn't alias with $rptr+3*$num
   2788 	# modulo 4096, which covers ret[num], am[num] and n[num]
   2789 	# (see bn_exp.c). This is done to allow the memory disambiguation
   2790 	# logic to do its magic. [Extra 256 bytes are for the power mask
   2791 	# calculated from the 7th argument, the index.]
   2792 	#
   2793 	lea	-320(%rsp,$num,2),%r11
   2794 	mov	%rsp,%rbp
   2795 	sub	$rptr,%r11
   2796 	and	\$4095,%r11
   2797 	cmp	%r11,%r10
   2798 	jb	.Lpwrx_sp_alt
   2799 	sub	%r11,%rbp		# align with $aptr
   2800 	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
   2801 	jmp	.Lpwrx_sp_done
   2802 
   2803 .align	32
   2804 .Lpwrx_sp_alt:
   2805 	lea	4096-320(,$num,2),%r10
   2806 	lea	-320(%rbp,$num,2),%rbp	# alloca(frame+2*$num*8+256)
   2807 	sub	%r10,%r11
   2808 	mov	\$0,%r10
   2809 	cmovc	%r10,%r11
   2810 	sub	%r11,%rbp
   2811 .Lpwrx_sp_done:
   2812 	and	\$-64,%rbp
   2813 	mov	%rsp,%r11
   2814 	sub	%rbp,%r11
   2815 	and	\$-4096,%r11
   2816 	lea	(%rbp,%r11),%rsp
   2817 	mov	(%rsp),%r10
   2818 	cmp	%rbp,%rsp
   2819 	ja	.Lpwrx_page_walk
   2820 	jmp	.Lpwrx_page_walk_done
   2821 
   2822 .Lpwrx_page_walk:
   2823 	lea	-4096(%rsp),%rsp
   2824 	mov	(%rsp),%r10
   2825 	cmp	%rbp,%rsp
   2826 	ja	.Lpwrx_page_walk
   2827 .Lpwrx_page_walk_done:
   2828 
   2829 	mov	$num,%r10
   2830 	neg	$num
   2831 
   2832 	##############################################################
   2833 	# Stack layout
   2834 	#
   2835 	# +0	saved $num, used in reduction section
   2836 	# +8	&t[2*$num], used in reduction section
   2837 	# +16	intermediate carry bit
   2838 	# +24	top-most carry bit, used in reduction section
   2839 	# +32	saved *n0
   2840 	# +40	saved %rsp
   2841 	# +48	t[2*$num]
   2842 	#
   2843 	pxor	%xmm0,%xmm0
   2844 	movq	$rptr,%xmm1		# save $rptr
   2845 	movq	$nptr,%xmm2		# save $nptr
   2846 	movq	%r10, %xmm3		# -$num
   2847 	movq	$bptr,%xmm4
   2848 	mov	$n0,  32(%rsp)
   2849 	mov	%rax, 40(%rsp)		# save original %rsp
   2850 .cfi_cfa_expression	%rsp+40,deref,+8
   2851 .Lpowerx5_body:
   2852 
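        	# Square the input five times (a -> a^(2^5)), then Montgomery-multiply
        	# by the power selected from the table; mulx4x_internal performs the
        	# constant-time gather itself using the index passed as 7th argument.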
   2853 	call	__bn_sqrx8x_internal
   2854 	call	__bn_postx4x_internal
   2855 	call	__bn_sqrx8x_internal
   2856 	call	__bn_postx4x_internal
   2857 	call	__bn_sqrx8x_internal
   2858 	call	__bn_postx4x_internal
   2859 	call	__bn_sqrx8x_internal
   2860 	call	__bn_postx4x_internal
   2861 	call	__bn_sqrx8x_internal
   2862 	call	__bn_postx4x_internal
   2863 
   2864 	mov	%r10,$num		# -num
   2865 	mov	$aptr,$rptr
   2866 	movq	%xmm2,$nptr
   2867 	movq	%xmm4,$bptr
   2868 	mov	40(%rsp),%rax
   2869 
   2870 	call	mulx4x_internal
   2871 
   2872 	mov	40(%rsp),%rsi		# restore %rsp
   2873 .cfi_def_cfa	%rsi,8
   2874 	mov	\$1,%rax
   2875 
   2876 	mov	-48(%rsi),%r15
   2877 .cfi_restore	%r15
   2878 	mov	-40(%rsi),%r14
   2879 .cfi_restore	%r14
   2880 	mov	-32(%rsi),%r13
   2881 .cfi_restore	%r13
   2882 	mov	-24(%rsi),%r12
   2883 .cfi_restore	%r12
   2884 	mov	-16(%rsi),%rbp
   2885 .cfi_restore	%rbp
   2886 	mov	-8(%rsi),%rbx
   2887 .cfi_restore	%rbx
   2888 	lea	(%rsi),%rsp
   2889 .cfi_def_cfa_register	%rsp
   2890 .Lpowerx5_epilogue:
   2891 	ret
   2892 .cfi_endproc
   2893 .size	bn_powerx5,.-bn_powerx5
   2894 
   2895 .globl	bn_sqrx8x_internal
   2896 .hidden	bn_sqrx8x_internal
   2897 .type	bn_sqrx8x_internal,\@abi-omnipotent
   2898 .align	32
   2899 bn_sqrx8x_internal:
   2900 __bn_sqrx8x_internal:
   2901 	##################################################################
   2902 	# Squaring part:
   2903 	#
   2904 	# a) multiply-n-add everything but a[i]*a[i];
   2905 	# b) shift result of a) by 1 to the left and accumulate
   2906 	#    a[i]*a[i] products;
   2907 	#
   2908 	##################################################################
   2909 	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
   2910 	#                                                     a[1]a[0]
   2911 	#                                                 a[2]a[0]
   2912 	#                                             a[3]a[0]
   2913 	#                                             a[2]a[1]
   2914 	#                                         a[3]a[1]
   2915 	#                                     a[3]a[2]
   2916 	#
   2917 	#                                         a[4]a[0]
   2918 	#                                     a[5]a[0]
   2919 	#                                 a[6]a[0]
   2920 	#                             a[7]a[0]
   2921 	#                                     a[4]a[1]
   2922 	#                                 a[5]a[1]
   2923 	#                             a[6]a[1]
   2924 	#                         a[7]a[1]
   2925 	#                                 a[4]a[2]
   2926 	#                             a[5]a[2]
   2927 	#                         a[6]a[2]
   2928 	#                     a[7]a[2]
   2929 	#                             a[4]a[3]
   2930 	#                         a[5]a[3]
   2931 	#                     a[6]a[3]
   2932 	#                 a[7]a[3]
   2933 	#
   2934 	#                     a[5]a[4]
   2935 	#                 a[6]a[4]
   2936 	#             a[7]a[4]
   2937 	#             a[6]a[5]
   2938 	#         a[7]a[5]
   2939 	#     a[7]a[6]
   2940 	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
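        	# i.e. a^2 = sum_i a[i]^2*2^(128*i) + 2*sum_{i<j} a[i]*a[j]*2^(64*(i+j)),
        	# so only the cross products above the diagonal are computed and the
        	# doubling is folded into the shift-and-add pass below.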
   2941 ___
   2942 {
   2943 my ($zero,$carry)=("%rbp","%rcx");
   2944 my $aaptr=$zero;
   2945 $code.=<<___;
   2946 	lea	48+8(%rsp),$tptr
   2947 	lea	($aptr,$num),$aaptr
   2948 	mov	$num,0+8(%rsp)			# save $num
   2949 	mov	$aaptr,8+8(%rsp)		# save end of $aptr
   2950 	jmp	.Lsqr8x_zero_start
   2951 
   2952 .align	32
   2953 .byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
   2954 .Lsqrx8x_zero:
   2955 	.byte	0x3e
   2956 	movdqa	%xmm0,0*8($tptr)
   2957 	movdqa	%xmm0,2*8($tptr)
   2958 	movdqa	%xmm0,4*8($tptr)
   2959 	movdqa	%xmm0,6*8($tptr)
   2960 .Lsqr8x_zero_start:			# aligned at 32
   2961 	movdqa	%xmm0,8*8($tptr)
   2962 	movdqa	%xmm0,10*8($tptr)
   2963 	movdqa	%xmm0,12*8($tptr)
   2964 	movdqa	%xmm0,14*8($tptr)
   2965 	lea	16*8($tptr),$tptr
   2966 	sub	\$64,$num
   2967 	jnz	.Lsqrx8x_zero
   2968 
   2969 	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
   2970 	#xor	%r9,%r9			# t[1], ex-$num, zero already
   2971 	xor	%r10,%r10
   2972 	xor	%r11,%r11
   2973 	xor	%r12,%r12
   2974 	xor	%r13,%r13
   2975 	xor	%r14,%r14
   2976 	xor	%r15,%r15
   2977 	lea	48+8(%rsp),$tptr
   2978 	xor	$zero,$zero		# cf=0, of=0
   2979 	jmp	.Lsqrx8x_outer_loop
   2980 
   2981 .align	32
   2982 .Lsqrx8x_outer_loop:
   2983 	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
   2984 	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
   2985 	adox	%rax,%r10
   2986 	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
   2987 	adcx	%r10,%r9
   2988 	adox	%rax,%r11
   2989 	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
   2990 	adcx	%r11,%r10
   2991 	adox	%rax,%r12
   2992 	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
   2993 	adcx	%r12,%r11
   2994 	adox	%rax,%r13
   2995 	mulx	5*8($aptr),%r12,%rax
   2996 	adcx	%r13,%r12
   2997 	adox	%rax,%r14
   2998 	mulx	6*8($aptr),%r13,%rax
   2999 	adcx	%r14,%r13
   3000 	adox	%r15,%rax
   3001 	mulx	7*8($aptr),%r14,%r15
   3002 	 mov	1*8($aptr),%rdx		# a[1]
   3003 	adcx	%rax,%r14
   3004 	adox	$zero,%r15
   3005 	adc	8*8($tptr),%r15
   3006 	mov	%r8,1*8($tptr)		# t[1]
   3007 	mov	%r9,2*8($tptr)		# t[2]
   3008 	sbb	$carry,$carry		# mov %cf,$carry
   3009 	xor	$zero,$zero		# cf=0, of=0
   3010 
   3011 
   3012 	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
   3013 	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
   3014 	adcx	%r10,%r8
   3015 	adox	%rbx,%r9
   3016 	mulx	4*8($aptr),%r10,%rbx	# ...
   3017 	adcx	%r11,%r9
   3018 	adox	%rax,%r10
   3019 	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx	5*8($aptr),%r11,%rax
   3020 	adcx	%r12,%r10
   3021 	adox	%rbx,%r11
   3022 	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r12,%rbx
   3023 	adcx	%r13,%r11
   3024 	adox	%r14,%r12
   3025 	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r13,%r14
   3026 	 mov	2*8($aptr),%rdx		# a[2]
   3027 	adcx	%rax,%r12
   3028 	adox	%rbx,%r13
   3029 	adcx	%r15,%r13
   3030 	adox	$zero,%r14		# of=0
   3031 	adcx	$zero,%r14		# cf=0
   3032 
   3033 	mov	%r8,3*8($tptr)		# t[3]
   3034 	mov	%r9,4*8($tptr)		# t[4]
   3035 
   3036 	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
   3037 	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
   3038 	adcx	%r10,%r8
   3039 	adox	%rbx,%r9
   3040 	mulx	5*8($aptr),%r10,%rbx	# ...
   3041 	adcx	%r11,%r9
   3042 	adox	%rax,%r10
   3043 	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r11,%rax
   3044 	adcx	%r12,%r10
   3045 	adox	%r13,%r11
   3046 	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r12,%r13
   3047 	.byte	0x3e
   3048 	 mov	3*8($aptr),%rdx		# a[3]
   3049 	adcx	%rbx,%r11
   3050 	adox	%rax,%r12
   3051 	adcx	%r14,%r12
   3052 	mov	%r8,5*8($tptr)		# t[5]
   3053 	mov	%r9,6*8($tptr)		# t[6]
   3054 	 mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
   3055 	adox	$zero,%r13		# of=0
   3056 	adcx	$zero,%r13		# cf=0
   3057 
   3058 	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
   3059 	adcx	%r10,%r8
   3060 	adox	%rax,%r9
   3061 	mulx	6*8($aptr),%r10,%rax	# ...
   3062 	adcx	%r11,%r9
   3063 	adox	%r12,%r10
   3064 	mulx	7*8($aptr),%r11,%r12
   3065 	 mov	4*8($aptr),%rdx		# a[4]
   3066 	 mov	5*8($aptr),%r14		# a[5]
   3067 	adcx	%rbx,%r10
   3068 	adox	%rax,%r11
   3069 	 mov	6*8($aptr),%r15		# a[6]
   3070 	adcx	%r13,%r11
   3071 	adox	$zero,%r12		# of=0
   3072 	adcx	$zero,%r12		# cf=0
   3073 
   3074 	mov	%r8,7*8($tptr)		# t[7]
   3075 	mov	%r9,8*8($tptr)		# t[8]
   3076 
   3077 	mulx	%r14,%r9,%rax		# a[5]*a[4]
   3078 	 mov	7*8($aptr),%r8		# a[7]
   3079 	adcx	%r10,%r9
   3080 	mulx	%r15,%r10,%rbx		# a[6]*a[4]
   3081 	adox	%rax,%r10
   3082 	adcx	%r11,%r10
   3083 	mulx	%r8,%r11,%rax		# a[7]*a[4]
   3084 	 mov	%r14,%rdx		# a[5]
   3085 	adox	%rbx,%r11
   3086 	adcx	%r12,%r11
   3087 	#adox	$zero,%rax		# of=0
   3088 	adcx	$zero,%rax		# cf=0
   3089 
   3090 	mulx	%r15,%r14,%rbx		# a[6]*a[5]
   3091 	mulx	%r8,%r12,%r13		# a[7]*a[5]
   3092 	 mov	%r15,%rdx		# a[6]
   3093 	 lea	8*8($aptr),$aptr
   3094 	adcx	%r14,%r11
   3095 	adox	%rbx,%r12
   3096 	adcx	%rax,%r12
   3097 	adox	$zero,%r13
   3098 
   3099 	.byte	0x67,0x67
   3100 	mulx	%r8,%r8,%r14		# a[7]*a[6]
   3101 	adcx	%r8,%r13
   3102 	adcx	$zero,%r14
   3103 
   3104 	cmp	8+8(%rsp),$aptr
   3105 	je	.Lsqrx8x_outer_break
   3106 
   3107 	neg	$carry			# mov $carry,%cf
   3108 	mov	\$-8,%rcx
   3109 	mov	$zero,%r15
   3110 	mov	8*8($tptr),%r8
   3111 	adcx	9*8($tptr),%r9		# +=t[9]
   3112 	adcx	10*8($tptr),%r10	# ...
   3113 	adcx	11*8($tptr),%r11
   3114 	adc	12*8($tptr),%r12
   3115 	adc	13*8($tptr),%r13
   3116 	adc	14*8($tptr),%r14
   3117 	adc	15*8($tptr),%r15
   3118 	lea	($aptr),$aaptr
   3119 	lea	2*64($tptr),$tptr
   3120 	sbb	%rax,%rax		# mov %cf,$carry
   3121 
   3122 	mov	-64($aptr),%rdx		# a[0]
   3123 	mov	%rax,16+8(%rsp)		# offload $carry
   3124 	mov	$tptr,24+8(%rsp)
   3125 
   3126 	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
   3127 	xor	%eax,%eax		# cf=0, of=0
   3128 	jmp	.Lsqrx8x_loop
   3129 
   3130 .align	32
   3131 .Lsqrx8x_loop:
   3132 	mov	%r8,%rbx
   3133 	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
   3134 	adcx	%rax,%rbx		# +=t[8]
   3135 	adox	%r9,%r8
   3136 
   3137 	mulx	1*8($aaptr),%rax,%r9	# ...
   3138 	adcx	%rax,%r8
   3139 	adox	%r10,%r9
   3140 
   3141 	mulx	2*8($aaptr),%rax,%r10
   3142 	adcx	%rax,%r9
   3143 	adox	%r11,%r10
   3144 
   3145 	mulx	3*8($aaptr),%rax,%r11
   3146 	adcx	%rax,%r10
   3147 	adox	%r12,%r11
   3148 
   3149 	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	4*8($aaptr),%rax,%r12
   3150 	adcx	%rax,%r11
   3151 	adox	%r13,%r12
   3152 
   3153 	mulx	5*8($aaptr),%rax,%r13
   3154 	adcx	%rax,%r12
   3155 	adox	%r14,%r13
   3156 
   3157 	mulx	6*8($aaptr),%rax,%r14
   3158 	 mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
   3159 	 mov	\$0,%ebx
   3160 	adcx	%rax,%r13
   3161 	adox	%r15,%r14
   3162 
   3163 	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx	7*8($aaptr),%rax,%r15
   3164 	 mov	8($aptr,%rcx,8),%rdx	# a[i]
   3165 	adcx	%rax,%r14
   3166 	adox	%rbx,%r15		# %rbx is 0, of=0
   3167 	adcx	%rbx,%r15		# cf=0
   3168 
   3169 	.byte	0x67
   3170 	inc	%rcx			# of=0
   3171 	jnz	.Lsqrx8x_loop
   3172 
   3173 	lea	8*8($aaptr),$aaptr
   3174 	mov	\$-8,%rcx
   3175 	cmp	8+8(%rsp),$aaptr	# done?
   3176 	je	.Lsqrx8x_break
   3177 
   3178 	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
   3179 	.byte	0x66
   3180 	mov	-64($aptr),%rdx
   3181 	adcx	0*8($tptr),%r8
   3182 	adcx	1*8($tptr),%r9
   3183 	adc	2*8($tptr),%r10
   3184 	adc	3*8($tptr),%r11
   3185 	adc	4*8($tptr),%r12
   3186 	adc	5*8($tptr),%r13
   3187 	adc	6*8($tptr),%r14
   3188 	adc	7*8($tptr),%r15
   3189 	lea	8*8($tptr),$tptr
   3190 	.byte	0x67
   3191 	sbb	%rax,%rax		# mov %cf,%rax
   3192 	xor	%ebx,%ebx		# cf=0, of=0
   3193 	mov	%rax,16+8(%rsp)		# offload carry
   3194 	jmp	.Lsqrx8x_loop
   3195 
   3196 .align	32
   3197 .Lsqrx8x_break:
   3198 	xor	$zero,$zero
   3199 	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
   3200 	adcx	$zero,%r8
   3201 	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
   3202 	adcx	$zero,%r9
   3203 	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
   3204 	adc	\$0,%r10
   3205 	mov	%r8,0*8($tptr)
   3206 	adc	\$0,%r11
   3207 	adc	\$0,%r12
   3208 	adc	\$0,%r13
   3209 	adc	\$0,%r14
   3210 	adc	\$0,%r15
   3211 	cmp	$carry,$tptr		# cf=0, of=0
   3212 	je	.Lsqrx8x_outer_loop
   3213 
   3214 	mov	%r9,1*8($tptr)
   3215 	 mov	1*8($carry),%r9
   3216 	mov	%r10,2*8($tptr)
   3217 	 mov	2*8($carry),%r10
   3218 	mov	%r11,3*8($tptr)
   3219 	 mov	3*8($carry),%r11
   3220 	mov	%r12,4*8($tptr)
   3221 	 mov	4*8($carry),%r12
   3222 	mov	%r13,5*8($tptr)
   3223 	 mov	5*8($carry),%r13
   3224 	mov	%r14,6*8($tptr)
   3225 	 mov	6*8($carry),%r14
   3226 	mov	%r15,7*8($tptr)
   3227 	 mov	7*8($carry),%r15
   3228 	mov	$carry,$tptr
   3229 	jmp	.Lsqrx8x_outer_loop
   3230 
   3231 .align	32
   3232 .Lsqrx8x_outer_break:
   3233 	mov	%r9,9*8($tptr)		# t[9]
   3234 	 movq	%xmm3,%rcx		# -$num
   3235 	mov	%r10,10*8($tptr)	# ...
   3236 	mov	%r11,11*8($tptr)
   3237 	mov	%r12,12*8($tptr)
   3238 	mov	%r13,13*8($tptr)
   3239 	mov	%r14,14*8($tptr)
   3240 ___
   3241 }{
   3243 my $i="%rcx";
   3244 $code.=<<___;
   3245 	lea	48+8(%rsp),$tptr
   3246 	mov	($aptr,$i),%rdx		# a[0]
   3247 
   3248 	mov	8($tptr),$A0[1]		# t[1]
   3249 	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
   3250 	mov	0+8(%rsp),$num		# restore $num
   3251 	adox	$A0[1],$A0[1]
   3252 	 mov	16($tptr),$A1[0]	# t[2]	# prefetch
   3253 	 mov	24($tptr),$A1[1]	# t[3]	# prefetch
   3254 	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned
   3255 
   3256 .align	32
   3257 .Lsqrx4x_shift_n_add:
   3258 	mulx	%rdx,%rax,%rbx
   3259 	 adox	$A1[0],$A1[0]
   3260 	adcx	$A0[0],%rax
   3261 	 .byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
   3262 	 .byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
   3263 	 adox	$A1[1],$A1[1]
   3264 	adcx	$A0[1],%rbx
   3265 	 mov	40($tptr),$A0[1]		# t[2*i+4+1]	# prefetch
   3266 	mov	%rax,0($tptr)
   3267 	mov	%rbx,8($tptr)
   3268 
   3269 	mulx	%rdx,%rax,%rbx
   3270 	 adox	$A0[0],$A0[0]
   3271 	adcx	$A1[0],%rax
   3272 	 mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
   3273 	 mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
   3274 	 adox	$A0[1],$A0[1]
   3275 	adcx	$A1[1],%rbx
   3276 	 mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
   3277 	mov	%rax,16($tptr)
   3278 	mov	%rbx,24($tptr)
   3279 
   3280 	mulx	%rdx,%rax,%rbx
   3281 	 adox	$A1[0],$A1[0]
   3282 	adcx	$A0[0],%rax
   3283 	 mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
   3284 	 lea	32($i),$i
   3285 	 mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
   3286 	 adox	$A1[1],$A1[1]
   3287 	adcx	$A0[1],%rbx
   3288 	 mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
   3289 	mov	%rax,32($tptr)
   3290 	mov	%rbx,40($tptr)
   3291 
   3292 	mulx	%rdx,%rax,%rbx
   3293 	 adox	$A0[0],$A0[0]
   3294 	adcx	$A1[0],%rax
   3295 	jrcxz	.Lsqrx4x_shift_n_add_break
   3296 	 .byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
   3297 	 adox	$A0[1],$A0[1]
   3298 	adcx	$A1[1],%rbx
   3299 	 mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
   3300 	 mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
   3301 	mov	%rax,48($tptr)
   3302 	mov	%rbx,56($tptr)
   3303 	lea	64($tptr),$tptr
   3304 	nop
   3305 	jmp	.Lsqrx4x_shift_n_add
   3306 
   3307 .align	32
   3308 .Lsqrx4x_shift_n_add_break:
   3309 	adcx	$A1[1],%rbx
   3310 	mov	%rax,48($tptr)
   3311 	mov	%rbx,56($tptr)
   3312 	lea	64($tptr),$tptr		# end of t[] buffer
   3313 ___
   3314 }
   3316 ######################################################################
   3317 # Montgomery reduction part, "word-by-word" algorithm.
   3318 #
   3319 # This new path is inspired by multiple submissions from Intel, by
   3320 # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
   3321 # Vinodh Gopal...
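        # As in the non-MULX path, each 8-word window of t[] is reduced by
        # computing m = n0*t[0] mod 2^64 and accumulating m*n[], but MULX with
        # the ADCX/ADOX instructions lets two independent carry chains (CF and
        # OF) run in parallel through the multiply-accumulate loop.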
   3322 {
   3323 my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
   3324 
   3325 $code.=<<___;
   3326 	movq	%xmm2,$nptr
   3327 __bn_sqrx8x_reduction:
   3328 	xor	%eax,%eax		# initial top-most carry bit
   3329 	mov	32+8(%rsp),%rbx		# n0
   3330 	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
   3331 	lea	-8*8($nptr,$num),%rcx	# end of n[]
   3332 	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
   3333 	mov	%rcx, 0+8(%rsp)		# save end of n[]
   3334 	mov	$tptr,8+8(%rsp)		# save end of t[]
   3335 
   3336 	lea	48+8(%rsp),$tptr		# initial t[] window
   3337 	jmp	.Lsqrx8x_reduction_loop
   3338 
   3339 .align	32
   3340 .Lsqrx8x_reduction_loop:
   3341 	mov	8*1($tptr),%r9
   3342 	mov	8*2($tptr),%r10
   3343 	mov	8*3($tptr),%r11
   3344 	mov	8*4($tptr),%r12
   3345 	mov	%rdx,%r8
   3346 	imulq	%rbx,%rdx		# n0*a[i]
   3347 	mov	8*5($tptr),%r13
   3348 	mov	8*6($tptr),%r14
   3349 	mov	8*7($tptr),%r15
   3350 	mov	%rax,24+8(%rsp)		# store top-most carry bit
   3351 
   3352 	lea	8*8($tptr),$tptr
   3353 	xor	$carry,$carry		# cf=0,of=0
   3354 	mov	\$-8,%rcx
   3355 	jmp	.Lsqrx8x_reduce
   3356 
   3357 .align	32
   3358 .Lsqrx8x_reduce:
   3359 	mov	%r8, %rbx
   3360 	mulx	8*0($nptr),%rax,%r8	# n[0]
   3361 	adcx	%rbx,%rax		# discarded
   3362 	adox	%r9,%r8
   3363 
   3364 	mulx	8*1($nptr),%rbx,%r9	# n[1]
   3365 	adcx	%rbx,%r8
   3366 	adox	%r10,%r9
   3367 
   3368 	mulx	8*2($nptr),%rbx,%r10
   3369 	adcx	%rbx,%r9
   3370 	adox	%r11,%r10
   3371 
   3372 	mulx	8*3($nptr),%rbx,%r11
   3373 	adcx	%rbx,%r10
   3374 	adox	%r12,%r11
   3375 
   3376 	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rbx,%r12
   3377 	 mov	%rdx,%rax
   3378 	 mov	%r8,%rdx
   3379 	adcx	%rbx,%r11
   3380 	adox	%r13,%r12
   3381 
   3382 	 mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
   3383 	 mov	%rax,%rdx
   3384 	 mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]
   3385 
   3386 	mulx	8*5($nptr),%rax,%r13
   3387 	adcx	%rax,%r12
   3388 	adox	%r14,%r13
   3389 
   3390 	mulx	8*6($nptr),%rax,%r14
   3391 	adcx	%rax,%r13
   3392 	adox	%r15,%r14
   3393 
   3394 	mulx	8*7($nptr),%rax,%r15
   3395 	 mov	%rbx,%rdx
   3396 	adcx	%rax,%r14
   3397 	adox	$carry,%r15		# $carry is 0
   3398 	adcx	$carry,%r15		# cf=0
   3399 
   3400 	.byte	0x67,0x67,0x67
   3401 	inc	%rcx			# of=0
   3402 	jnz	.Lsqrx8x_reduce
   3403 
   3404 	mov	$carry,%rax		# xor	%rax,%rax
   3405 	cmp	0+8(%rsp),$nptr		# end of n[]?
   3406 	jae	.Lsqrx8x_no_tail
   3407 
   3408 	mov	48+8(%rsp),%rdx		# pull n0*a[0]
   3409 	add	8*0($tptr),%r8
   3410 	lea	8*8($nptr),$nptr
   3411 	mov	\$-8,%rcx
   3412 	adcx	8*1($tptr),%r9
   3413 	adcx	8*2($tptr),%r10
   3414 	adc	8*3($tptr),%r11
   3415 	adc	8*4($tptr),%r12
   3416 	adc	8*5($tptr),%r13
   3417 	adc	8*6($tptr),%r14
   3418 	adc	8*7($tptr),%r15
   3419 	lea	8*8($tptr),$tptr
   3420 	sbb	%rax,%rax		# top carry
   3421 
   3422 	xor	$carry,$carry		# of=0, cf=0
   3423 	mov	%rax,16+8(%rsp)
   3424 	jmp	.Lsqrx8x_tail
   3425 
   3426 .align	32
   3427 .Lsqrx8x_tail:
   3428 	mov	%r8,%rbx
   3429 	mulx	8*0($nptr),%rax,%r8
   3430 	adcx	%rax,%rbx
   3431 	adox	%r9,%r8
   3432 
   3433 	mulx	8*1($nptr),%rax,%r9
   3434 	adcx	%rax,%r8
   3435 	adox	%r10,%r9
   3436 
   3437 	mulx	8*2($nptr),%rax,%r10
   3438 	adcx	%rax,%r9
   3439 	adox	%r11,%r10
   3440 
   3441 	mulx	8*3($nptr),%rax,%r11
   3442 	adcx	%rax,%r10
   3443 	adox	%r12,%r11
   3444 
   3445 	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rax,%r12
   3446 	adcx	%rax,%r11
   3447 	adox	%r13,%r12
   3448 
   3449 	mulx	8*5($nptr),%rax,%r13
   3450 	adcx	%rax,%r12
   3451 	adox	%r14,%r13
   3452 
   3453 	mulx	8*6($nptr),%rax,%r14
   3454 	adcx	%rax,%r13
   3455 	adox	%r15,%r14
   3456 
   3457 	mulx	8*7($nptr),%rax,%r15
   3458 	 mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
   3459 	adcx	%rax,%r14
   3460 	adox	$carry,%r15
   3461 	 mov	%rbx,($tptr,%rcx,8)	# save result
   3462 	 mov	%r8,%rbx
   3463 	adcx	$carry,%r15		# cf=0
   3464 
   3465 	inc	%rcx			# of=0
   3466 	jnz	.Lsqrx8x_tail
   3467 
   3468 	cmp	0+8(%rsp),$nptr		# end of n[]?
   3469 	jae	.Lsqrx8x_tail_done	# break out of loop
   3470 
   3471 	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
   3472 	 mov	48+8(%rsp),%rdx		# pull n0*a[0]
   3473 	 lea	8*8($nptr),$nptr
   3474 	adc	8*0($tptr),%r8
   3475 	adc	8*1($tptr),%r9
   3476 	adc	8*2($tptr),%r10
   3477 	adc	8*3($tptr),%r11
   3478 	adc	8*4($tptr),%r12
   3479 	adc	8*5($tptr),%r13
   3480 	adc	8*6($tptr),%r14
   3481 	adc	8*7($tptr),%r15
   3482 	lea	8*8($tptr),$tptr
   3483 	sbb	%rax,%rax
   3484 	sub	\$8,%rcx		# mov	\$-8,%rcx
   3485 
   3486 	xor	$carry,$carry		# of=0, cf=0
   3487 	mov	%rax,16+8(%rsp)
   3488 	jmp	.Lsqrx8x_tail
   3489 
   3490 .align	32
   3491 .Lsqrx8x_tail_done:
   3492 	xor	%rax,%rax
   3493 	add	24+8(%rsp),%r8		# can this overflow?
   3494 	adc	\$0,%r9
   3495 	adc	\$0,%r10
   3496 	adc	\$0,%r11
   3497 	adc	\$0,%r12
   3498 	adc	\$0,%r13
   3499 	adc	\$0,%r14
   3500 	adc	\$0,%r15
   3501 	adc	\$0,%rax
   3502 
   3503 	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
   3504 .Lsqrx8x_no_tail:			# %cf is 0 if jumped here
   3505 	adc	8*0($tptr),%r8
   3506 	 movq	%xmm3,%rcx
   3507 	adc	8*1($tptr),%r9
   3508 	 mov	8*7($nptr),$carry
   3509 	 movq	%xmm2,$nptr		# restore $nptr
   3510 	adc	8*2($tptr),%r10
   3511 	adc	8*3($tptr),%r11
   3512 	adc	8*4($tptr),%r12
   3513 	adc	8*5($tptr),%r13
   3514 	adc	8*6($tptr),%r14
   3515 	adc	8*7($tptr),%r15
   3516 	adc	\$0,%rax		# top-most carry
   3517 
   3518 	mov	32+8(%rsp),%rbx		# n0
   3519 	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"
   3520 
   3521 	mov	%r8,8*0($tptr)		# store top 512 bits
   3522 	 lea	8*8($tptr),%r8		# borrow %r8
   3523 	mov	%r9,8*1($tptr)
   3524 	mov	%r10,8*2($tptr)
   3525 	mov	%r11,8*3($tptr)
   3526 	mov	%r12,8*4($tptr)
   3527 	mov	%r13,8*5($tptr)
   3528 	mov	%r14,8*6($tptr)
   3529 	mov	%r15,8*7($tptr)
   3530 
   3531 	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
   3532 	cmp	8+8(%rsp),%r8		# end of t[]?
   3533 	jb	.Lsqrx8x_reduction_loop
   3534 	ret
   3535 .size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
   3536 ___
   3537 }
   3539 ##############################################################
   3540 # Post-condition, 4x unrolled
   3541 #
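        # Same branch-free conditional subtraction as __bn_post4x_internal, but
        # using ANDN (BMI1) to combine the complement and the mask in one
        # instruction per word.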
   3542 {
   3543 my ($rptr,$nptr)=("%rdx","%rbp");
   3544 $code.=<<___;
   3545 .align	32
   3546 __bn_postx4x_internal:
   3547 	mov	8*0($nptr),%r12
   3548 	mov	%rcx,%r10		# -$num
   3549 	mov	%rcx,%r9		# -$num
   3550 	neg	%rax
   3551 	sar	\$3+2,%rcx
   3552 	#lea	48+8(%rsp,%r9),$tptr
   3553 	movq	%xmm1,$rptr		# restore $rptr
   3554 	movq	%xmm1,$aptr		# prepare for back-to-back call
   3555 	dec	%r12			# so that after 'not' we get -n[0]
   3556 	mov	8*1($nptr),%r13
   3557 	xor	%r8,%r8
   3558 	mov	8*2($nptr),%r14
   3559 	mov	8*3($nptr),%r15
   3560 	jmp	.Lsqrx4x_sub_entry
   3561 
   3562 .align	16
   3563 .Lsqrx4x_sub:
   3564 	mov	8*0($nptr),%r12
   3565 	mov	8*1($nptr),%r13
   3566 	mov	8*2($nptr),%r14
   3567 	mov	8*3($nptr),%r15
   3568 .Lsqrx4x_sub_entry:
   3569 	andn	%rax,%r12,%r12
   3570 	lea	8*4($nptr),$nptr
   3571 	andn	%rax,%r13,%r13
   3572 	andn	%rax,%r14,%r14
   3573 	andn	%rax,%r15,%r15
   3574 
   3575 	neg	%r8			# mov %r8,%cf
   3576 	adc	8*0($tptr),%r12
   3577 	adc	8*1($tptr),%r13
   3578 	adc	8*2($tptr),%r14
   3579 	adc	8*3($tptr),%r15
   3580 	mov	%r12,8*0($rptr)
   3581 	lea	8*4($tptr),$tptr
   3582 	mov	%r13,8*1($rptr)
   3583 	sbb	%r8,%r8			# mov %cf,%r8
   3584 	mov	%r14,8*2($rptr)
   3585 	mov	%r15,8*3($rptr)
   3586 	lea	8*4($rptr),$rptr
   3587 
   3588 	inc	%rcx
   3589 	jnz	.Lsqrx4x_sub
   3590 
   3591 	neg	%r9			# restore $num
   3592 
   3593 	ret
   3594 .size	__bn_postx4x_internal,.-__bn_postx4x_internal
   3595 ___
   3596 }
   3597 }}}
   3598 {
   3599 my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
   3600 				("%rdi","%esi","%rdx","%ecx");  # Unix order
   3601 my $out=$inp;
   3602 my $STRIDE=2**5*8;
   3603 my $N=$STRIDE/4;
   3604 
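        # bn_scatter5() stores a num-word value into column idx of a 32-column
        # table: word j of the input goes to tbl[idx + 32*j], i.e. one 64-bit
        # word every 32*8 bytes.  bn_gather5() reads it back by loading every
        # column of each 256-byte row and masking, so the memory access pattern
        # is independent of idx.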
   3605 $code.=<<___;
   3606 .globl	bn_scatter5
   3607 .type	bn_scatter5,\@abi-omnipotent
   3608 .align	16
   3609 bn_scatter5:
   3610 	cmp	\$0, $num
   3611 	jz	.Lscatter_epilogue
   3612 	lea	($tbl,$idx,8),$tbl
   3613 .Lscatter:
   3614 	mov	($inp),%rax
   3615 	lea	8($inp),$inp
   3616 	mov	%rax,($tbl)
   3617 	lea	32*8($tbl),$tbl
   3618 	sub	\$1,$num
   3619 	jnz	.Lscatter
   3620 .Lscatter_epilogue:
   3621 	ret
   3622 .size	bn_scatter5,.-bn_scatter5
   3623 
   3624 .globl	bn_gather5
   3625 .type	bn_gather5,\@abi-omnipotent
   3626 .align	32
   3627 bn_gather5:
   3628 .LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
   3629 	# I can't trust the assembler to use a specific encoding :-(
   3630 	.byte	0x4c,0x8d,0x14,0x24			#lea    (%rsp),%r10
   3631 	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	#sub	$0x108,%rsp
   3632 	lea	.Linc(%rip),%rax
   3633 	and	\$-16,%rsp		# shouldn't be formally required
   3634 
   3635 	movd	$idx,%xmm5
   3636 	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
   3637 	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
   3638 	lea	128($tbl),%r11		# size optimization
   3639 	lea	128(%rsp),%rax		# size optimization
   3640 
   3641 	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
   3642 	movdqa	%xmm1,%xmm4
   3643 	movdqa	%xmm1,%xmm2
   3644 ___
   3645 ########################################################################
   3646 # calculate mask by comparing 0..31 to $idx and save result to stack
   3647 #
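        # As in mulx4x_internal, the 256-byte mask ends up with exactly one
        # 64-bit lane set (the one matching $idx); .Lgather ANDs each 16-byte
        # chunk of the current table row with the corresponding mask chunk and
        # ORs everything together to recover the selected word.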
   3648 for($i=0;$i<$STRIDE/16;$i+=4) {
   3649 $code.=<<___;
   3650 	paddd	%xmm0,%xmm1
   3651 	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
   3652 ___
   3653 $code.=<<___	if ($i);
   3654 	movdqa	%xmm3,`16*($i-1)-128`(%rax)
   3655 ___
   3656 $code.=<<___;
   3657 	movdqa	%xmm4,%xmm3
   3658 
   3659 	paddd	%xmm1,%xmm2
   3660 	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
   3661 	movdqa	%xmm0,`16*($i+0)-128`(%rax)
   3662 	movdqa	%xmm4,%xmm0
   3663 
   3664 	paddd	%xmm2,%xmm3
   3665 	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
   3666 	movdqa	%xmm1,`16*($i+1)-128`(%rax)
   3667 	movdqa	%xmm4,%xmm1
   3668 
   3669 	paddd	%xmm3,%xmm0
   3670 	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
   3671 	movdqa	%xmm2,`16*($i+2)-128`(%rax)
   3672 	movdqa	%xmm4,%xmm2
   3673 ___
   3674 }
   3675 $code.=<<___;
   3676 	movdqa	%xmm3,`16*($i-1)-128`(%rax)
   3677 	jmp	.Lgather
   3678 
   3679 .align	32
   3680 .Lgather:
   3681 	pxor	%xmm4,%xmm4
   3682 	pxor	%xmm5,%xmm5
   3683 ___
   3684 for($i=0;$i<$STRIDE/16;$i+=4) {
   3685 $code.=<<___;
   3686 	movdqa	`16*($i+0)-128`(%r11),%xmm0
   3687 	movdqa	`16*($i+1)-128`(%r11),%xmm1
   3688 	movdqa	`16*($i+2)-128`(%r11),%xmm2
   3689 	pand	`16*($i+0)-128`(%rax),%xmm0
   3690 	movdqa	`16*($i+3)-128`(%r11),%xmm3
   3691 	pand	`16*($i+1)-128`(%rax),%xmm1
   3692 	por	%xmm0,%xmm4
   3693 	pand	`16*($i+2)-128`(%rax),%xmm2
   3694 	por	%xmm1,%xmm5
   3695 	pand	`16*($i+3)-128`(%rax),%xmm3
   3696 	por	%xmm2,%xmm4
   3697 	por	%xmm3,%xmm5
   3698 ___
   3699 }
   3700 $code.=<<___;
   3701 	por	%xmm5,%xmm4
   3702 	lea	$STRIDE(%r11),%r11
   3703 	pshufd	\$0x4e,%xmm4,%xmm0
   3704 	por	%xmm4,%xmm0
   3705 	movq	%xmm0,($out)		# m0=bp[0]
   3706 	lea	8($out),$out
   3707 	sub	\$1,$num
   3708 	jnz	.Lgather
   3709 
   3710 	lea	(%r10),%rsp
   3711 	ret
   3712 .LSEH_end_bn_gather5:
   3713 .size	bn_gather5,.-bn_gather5
   3714 ___
   3715 }
   3716 $code.=<<___;
   3717 .align	64
   3718 .Linc:
   3719 	.long	0,0, 1,1
   3720 	.long	2,2, 2,2
   3721 .asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   3722 ___
   3723 
   3724 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   3725 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   3726 if ($win64) {
   3727 $rec="%rcx";
   3728 $frame="%rdx";
   3729 $context="%r8";
   3730 $disp="%r9";
   3731 
   3732 $code.=<<___;
   3733 .extern	__imp_RtlVirtualUnwind
   3734 .type	mul_handler,\@abi-omnipotent
   3735 .align	16
   3736 mul_handler:
   3737 	push	%rsi
   3738 	push	%rdi
   3739 	push	%rbx
   3740 	push	%rbp
   3741 	push	%r12
   3742 	push	%r13
   3743 	push	%r14
   3744 	push	%r15
   3745 	pushfq
   3746 	sub	\$64,%rsp
   3747 
   3748 	mov	120($context),%rax	# pull context->Rax
   3749 	mov	248($context),%rbx	# pull context->Rip
   3750 
   3751 	mov	8($disp),%rsi		# disp->ImageBase
   3752 	mov	56($disp),%r11		# disp->HandlerData
   3753 
   3754 	mov	0(%r11),%r10d		# HandlerData[0]
   3755 	lea	(%rsi,%r10),%r10	# end of prologue label
   3756 	cmp	%r10,%rbx		# context->Rip<end of prologue label
   3757 	jb	.Lcommon_seh_tail
   3758 
   3759 	mov	4(%r11),%r10d		# HandlerData[1]
   3760 	lea	(%rsi,%r10),%r10	# beginning of body label
   3761 	cmp	%r10,%rbx		# context->Rip<body label
   3762 	jb	.Lcommon_pop_regs
   3763 
   3764 	mov	152($context),%rax	# pull context->Rsp
   3765 
   3766 	mov	8(%r11),%r10d		# HandlerData[2]
   3767 	lea	(%rsi,%r10),%r10	# epilogue label
   3768 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   3769 	jae	.Lcommon_seh_tail
   3770 
   3771 	lea	.Lmul_epilogue(%rip),%r10
   3772 	cmp	%r10,%rbx
   3773 	ja	.Lbody_40
   3774 
   3775 	mov	192($context),%r10	# pull $num
   3776 	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
   3777 
   3778 	jmp	.Lcommon_pop_regs
   3779 
   3780 .Lbody_40:
   3781 	mov	40(%rax),%rax		# pull saved stack pointer
   3782 .Lcommon_pop_regs:
   3783 	mov	-8(%rax),%rbx
   3784 	mov	-16(%rax),%rbp
   3785 	mov	-24(%rax),%r12
   3786 	mov	-32(%rax),%r13
   3787 	mov	-40(%rax),%r14
   3788 	mov	-48(%rax),%r15
   3789 	mov	%rbx,144($context)	# restore context->Rbx
   3790 	mov	%rbp,160($context)	# restore context->Rbp
   3791 	mov	%r12,216($context)	# restore context->R12
   3792 	mov	%r13,224($context)	# restore context->R13
   3793 	mov	%r14,232($context)	# restore context->R14
   3794 	mov	%r15,240($context)	# restore context->R15
   3795 
   3796 .Lcommon_seh_tail:
   3797 	mov	8(%rax),%rdi
   3798 	mov	16(%rax),%rsi
   3799 	mov	%rax,152($context)	# restore context->Rsp
   3800 	mov	%rsi,168($context)	# restore context->Rsi
   3801 	mov	%rdi,176($context)	# restore context->Rdi
   3802 
   3803 	mov	40($disp),%rdi		# disp->ContextRecord
   3804 	mov	$context,%rsi		# context
   3805 	mov	\$154,%ecx		# sizeof(CONTEXT) in quad-words
   3806 	.long	0xa548f3fc		# cld; rep movsq
   3807 
   3808 	mov	$disp,%rsi
   3809 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   3810 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   3811 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   3812 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   3813 	mov	40(%rsi),%r10		# disp->ContextRecord
   3814 	lea	56(%rsi),%r11		# &disp->HandlerData
   3815 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   3816 	mov	%r10,32(%rsp)		# arg5
   3817 	mov	%r11,40(%rsp)		# arg6
   3818 	mov	%r12,48(%rsp)		# arg7
   3819 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   3820 	call	*__imp_RtlVirtualUnwind(%rip)
   3821 
   3822 	mov	\$1,%eax		# ExceptionContinueSearch
   3823 	add	\$64,%rsp
   3824 	popfq
   3825 	pop	%r15
   3826 	pop	%r14
   3827 	pop	%r13
   3828 	pop	%r12
   3829 	pop	%rbp
   3830 	pop	%rbx
   3831 	pop	%rdi
   3832 	pop	%rsi
   3833 	ret
   3834 .size	mul_handler,.-mul_handler
   3835 
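# The .pdata entries associate each routine's [SEH_begin,SEH_end) code
# range with its unwind record in the .xdata section below.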
   3836 .section	.pdata
   3837 .align	4
   3838 	.rva	.LSEH_begin_bn_mul_mont_gather5
   3839 	.rva	.LSEH_end_bn_mul_mont_gather5
   3840 	.rva	.LSEH_info_bn_mul_mont_gather5
   3841 
   3842 	.rva	.LSEH_begin_bn_mul4x_mont_gather5
   3843 	.rva	.LSEH_end_bn_mul4x_mont_gather5
   3844 	.rva	.LSEH_info_bn_mul4x_mont_gather5
   3845 
   3846 	.rva	.LSEH_begin_bn_power5
   3847 	.rva	.LSEH_end_bn_power5
   3848 	.rva	.LSEH_info_bn_power5
   3849 
   3850 	.rva	.LSEH_begin_bn_from_mont8x
   3851 	.rva	.LSEH_end_bn_from_mont8x
   3852 	.rva	.LSEH_info_bn_from_mont8x
   3853 ___
   3854 $code.=<<___ if ($addx);
   3855 	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
   3856 	.rva	.LSEH_end_bn_mulx4x_mont_gather5
   3857 	.rva	.LSEH_info_bn_mulx4x_mont_gather5
   3858 
   3859 	.rva	.LSEH_begin_bn_powerx5
   3860 	.rva	.LSEH_end_bn_powerx5
   3861 	.rva	.LSEH_info_bn_powerx5
   3862 ___
   3863 $code.=<<___;
   3864 	.rva	.LSEH_begin_bn_gather5
   3865 	.rva	.LSEH_end_bn_gather5
   3866 	.rva	.LSEH_info_bn_gather5
   3867 
   3868 .section	.xdata
   3869 .align	8
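# Each SEH_info record below consists of: .byte 9,0,0,0 (UNWIND_INFO
# version 1, UNW_FLAG_EHANDLER, no unwind codes), the RVA of mul_handler,
# and three label RVAs that mul_handler consumes as HandlerData[0..2]:
# end of prologue, beginning of body and epilogue respectively.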
   3870 .LSEH_info_bn_mul_mont_gather5:
   3871 	.byte	9,0,0,0
   3872 	.rva	mul_handler
   3873 	.rva	.Lmul_body,.Lmul_body,.Lmul_epilogue		# HandlerData[]
   3874 .align	8
   3875 .LSEH_info_bn_mul4x_mont_gather5:
   3876 	.byte	9,0,0,0
   3877 	.rva	mul_handler
   3878 	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
   3879 .align	8
   3880 .LSEH_info_bn_power5:
   3881 	.byte	9,0,0,0
   3882 	.rva	mul_handler
   3883 	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
   3884 .align	8
   3885 .LSEH_info_bn_from_mont8x:
   3886 	.byte	9,0,0,0
   3887 	.rva	mul_handler
   3888 	.rva	.Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
   3889 ___
   3890 $code.=<<___ if ($addx);
   3891 .align	8
   3892 .LSEH_info_bn_mulx4x_mont_gather5:
   3893 	.byte	9,0,0,0
   3894 	.rva	mul_handler
   3895 	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
   3896 .align	8
   3897 .LSEH_info_bn_powerx5:
   3898 	.byte	9,0,0,0
   3899 	.rva	mul_handler
   3900 	.rva	.Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
   3901 ___
   3902 $code.=<<___;
   3903 .align	8
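# bn_gather5 needs no exception handler; its UNWIND_INFO is spelled out
# as raw bytes describing the prologue: the 0x108-byte stack allocation
# and %r10 as the register holding the previous stack pointer.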
   3904 .LSEH_info_bn_gather5:
   3905 	.byte	0x01,0x0b,0x03,0x0a
   3906 	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
   3907 	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
   3908 .align	8
   3909 ___
   3910 }
   3911 
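# Evaluate the backtick-quoted expressions embedded above (e.g.
# `16*($i+0)-128`) so that only constant displacements reach the
# assembler, then emit the result.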
   3912 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
   3913 
   3914 print $code;
   3915 close STDOUT;
   3916