      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # October 2005.
     11 #
      12 # Montgomery multiplication routine for x86_64. While it gives only a
      13 # modest 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs
      14 # more than twice as fast. The most common case, rsa1024 sign, is
      15 # improved by a respectable 50%. It remains to be seen whether loop
      16 # unrolling and a dedicated squaring routine can improve this further...
     17 
     18 # July 2011.
     19 #
      20 # Add a dedicated squaring procedure. The performance improvement varies
      21 # from platform to platform, but on average it's ~5%/15%/25%/33%
      22 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
     23 
     24 # August 2011.
     25 #
      26 # Unroll and modulo-schedule the inner loops in such a manner that they
      27 # are "fallen through" for input lengths of 8, which is critical for
      28 # 1024-bit RSA *sign*. The average performance improvement over the
      29 # *initial* 2005 version of this module is ~0%/30%/40%/45%
      30 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
     31 
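# For reference, all three routines below compute the Montgomery product
# rp[] = ap[]*bp[]*2^(-64*num) mod np[], with n0[0] = -np[0]^(-1) mod 2^64
# precomputed by the caller. A minimal word-by-word model of the operation
# (illustrative only, not used by this module; mont_mul_ref is a hypothetical
# name and the operands are assumed to be Math::BigInt objects):
#
#	sub mont_mul_ref {
#	    use Math::BigInt;
#	    my ($a,$b,$n,$n0,$num) = @_;	# returns a*b*R^(-1) mod n, R=2^(64*num)
#	    my $mask = (Math::BigInt->new(1) << 64) - 1;
#	    my $t = Math::BigInt->new(0);
#	    for my $i (0 .. $num-1) {
#	        $t += $a * (($b >> (64*$i)) & $mask);	# t += a*b[i]
#	        my $m = (($t & $mask) * $n0) & $mask;	# m = t[0]*n0 mod 2^64
#	        $t += $n * $m;				# low word of t becomes zero
#	        $t >>= 64;				# exact division by 2^64
#	    }
#	    $t -= $n if $t >= $n;			# final conditional subtraction
#	    return $t;
#	}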
     32 $flavour = shift;
     33 $output  = shift;
     34 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     35 
     36 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     37 
     38 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     39 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     40 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     41 die "can't locate x86_64-xlate.pl";
     42 
     43 open OUT,"| \"$^X\" $xlate $flavour $output";
     44 *STDOUT=*OUT;
     45 
     46 # int bn_mul_mont(
     47 $rp="%rdi";	# BN_ULONG *rp,
     48 $ap="%rsi";	# const BN_ULONG *ap,
     49 $bp="%rdx";	# const BN_ULONG *bp,
     50 $np="%rcx";	# const BN_ULONG *np,
     51 $n0="%r8";	# const BN_ULONG *n0,
     52 $num="%r9";	# int num);
     53 $lo0="%r10";
     54 $hi0="%r11";
     55 $hi1="%r13";
     56 $i="%r14";
     57 $j="%r15";
     58 $m0="%rbx";
     59 $m1="%rbp";
     60 
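# Informally, each pass of the outer loop below processes one word bp[i]
# (with tp[] starting out as zero):
#
#	m1   = (tp[0] + ap[0]*bp[i]) * n0 mod 2^64
#	tp[] = (tp[] + ap[]*bp[i] + np[]*m1) >> 64
#
# so that after num passes tp[] holds ap*bp*2^(-64*num) mod np, short of
# one final conditional subtraction of np[].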
     61 $code=<<___;
     62 .text
     63 
     64 .globl	bn_mul_mont
     65 .type	bn_mul_mont,\@function,6
     66 .align	16
     67 bn_mul_mont:
     68 	test	\$3,${num}d
     69 	jnz	.Lmul_enter
     70 	cmp	\$8,${num}d
     71 	jb	.Lmul_enter
     72 	cmp	$ap,$bp
     73 	jne	.Lmul4x_enter
     74 	jmp	.Lsqr4x_enter
     75 
     76 .align	16
     77 .Lmul_enter:
     78 	push	%rbx
     79 	push	%rbp
     80 	push	%r12
     81 	push	%r13
     82 	push	%r14
     83 	push	%r15
     84 
     85 	mov	${num}d,${num}d
     86 	lea	2($num),%r10
     87 	mov	%rsp,%r11
     88 	neg	%r10
     89 	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
     90 	and	\$-1024,%rsp		# minimize TLB usage
     91 
     92 	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
     93 .Lmul_body:
     94 	mov	$bp,%r12		# reassign $bp
     95 ___
     96 		$bp="%r12";
     97 $code.=<<___;
     98 	mov	($n0),$n0		# pull n0[0] value
     99 	mov	($bp),$m0		# m0=bp[0]
    100 	mov	($ap),%rax
    101 
    102 	xor	$i,$i			# i=0
    103 	xor	$j,$j			# j=0
    104 
    105 	mov	$n0,$m1
    106 	mulq	$m0			# ap[0]*bp[0]
    107 	mov	%rax,$lo0
    108 	mov	($np),%rax
    109 
    110 	imulq	$lo0,$m1		# "tp[0]"*n0
    111 	mov	%rdx,$hi0
    112 
    113 	mulq	$m1			# np[0]*m1
    114 	add	%rax,$lo0		# discarded
    115 	mov	8($ap),%rax
    116 	adc	\$0,%rdx
    117 	mov	%rdx,$hi1
    118 
    119 	lea	1($j),$j		# j++
    120 	jmp	.L1st_enter
    121 
    122 .align	16
    123 .L1st:
    124 	add	%rax,$hi1
    125 	mov	($ap,$j,8),%rax
    126 	adc	\$0,%rdx
    127 	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
    128 	mov	$lo0,$hi0
    129 	adc	\$0,%rdx
    130 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    131 	mov	%rdx,$hi1
    132 
    133 .L1st_enter:
    134 	mulq	$m0			# ap[j]*bp[0]
    135 	add	%rax,$hi0
    136 	mov	($np,$j,8),%rax
    137 	adc	\$0,%rdx
    138 	lea	1($j),$j		# j++
    139 	mov	%rdx,$lo0
    140 
    141 	mulq	$m1			# np[j]*m1
    142 	cmp	$num,$j
    143 	jne	.L1st
    144 
    145 	add	%rax,$hi1
    146 	mov	($ap),%rax		# ap[0]
    147 	adc	\$0,%rdx
    148 	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
    149 	adc	\$0,%rdx
    150 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    151 	mov	%rdx,$hi1
    152 	mov	$lo0,$hi0
    153 
    154 	xor	%rdx,%rdx
    155 	add	$hi0,$hi1
    156 	adc	\$0,%rdx
    157 	mov	$hi1,-8(%rsp,$num,8)
    158 	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
    159 
    160 	lea	1($i),$i		# i++
    161 	jmp	.Louter
    162 .align	16
    163 .Louter:
    164 	mov	($bp,$i,8),$m0		# m0=bp[i]
    165 	xor	$j,$j			# j=0
    166 	mov	$n0,$m1
    167 	mov	(%rsp),$lo0
    168 	mulq	$m0			# ap[0]*bp[i]
    169 	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
    170 	mov	($np),%rax
    171 	adc	\$0,%rdx
    172 
    173 	imulq	$lo0,$m1		# tp[0]*n0
    174 	mov	%rdx,$hi0
    175 
    176 	mulq	$m1			# np[0]*m1
    177 	add	%rax,$lo0		# discarded
    178 	mov	8($ap),%rax
    179 	adc	\$0,%rdx
    180 	mov	8(%rsp),$lo0		# tp[1]
    181 	mov	%rdx,$hi1
    182 
    183 	lea	1($j),$j		# j++
    184 	jmp	.Linner_enter
    185 
    186 .align	16
    187 .Linner:
    188 	add	%rax,$hi1
    189 	mov	($ap,$j,8),%rax
    190 	adc	\$0,%rdx
    191 	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
    192 	mov	(%rsp,$j,8),$lo0
    193 	adc	\$0,%rdx
    194 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    195 	mov	%rdx,$hi1
    196 
    197 .Linner_enter:
    198 	mulq	$m0			# ap[j]*bp[i]
    199 	add	%rax,$hi0
    200 	mov	($np,$j,8),%rax
    201 	adc	\$0,%rdx
    202 	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
    203 	mov	%rdx,$hi0
    204 	adc	\$0,$hi0
    205 	lea	1($j),$j		# j++
    206 
    207 	mulq	$m1			# np[j]*m1
    208 	cmp	$num,$j
    209 	jne	.Linner
    210 
    211 	add	%rax,$hi1
    212 	mov	($ap),%rax		# ap[0]
    213 	adc	\$0,%rdx
    214 	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
    215 	mov	(%rsp,$j,8),$lo0
    216 	adc	\$0,%rdx
    217 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    218 	mov	%rdx,$hi1
    219 
    220 	xor	%rdx,%rdx
    221 	add	$hi0,$hi1
    222 	adc	\$0,%rdx
    223 	add	$lo0,$hi1		# pull upmost overflow bit
    224 	adc	\$0,%rdx
    225 	mov	$hi1,-8(%rsp,$num,8)
    226 	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
    227 
    228 	lea	1($i),$i		# i++
    229 	cmp	$num,$i
    230 	jl	.Louter
    231 
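	# Final reduction: rp[]=tp[]-np[] is computed below with borrow
	# propagation; "sbb \$0,%rax" then folds the borrow into the overflow
	# word tp[num], leaving %rax all-ones exactly when tp < np (keep tp)
	# and 0 or 1 otherwise (keep the difference already stored in rp[]).
	# Both pointers are 8-byte aligned, so the and/not/or sequence acts
	# as a branchless select of the source for the copy loop.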
    232 	xor	$i,$i			# i=0 and clear CF!
    233 	mov	(%rsp),%rax		# tp[0]
    234 	lea	(%rsp),$ap		# borrow ap for tp
    235 	mov	$num,$j			# j=num
    236 	jmp	.Lsub
    237 .align	16
    238 .Lsub:	sbb	($np,$i,8),%rax
    239 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
    240 	mov	8($ap,$i,8),%rax	# tp[i+1]
    241 	lea	1($i),$i		# i++
     242 	dec	$j			# doesn't affect CF!
    243 	jnz	.Lsub
    244 
    245 	sbb	\$0,%rax		# handle upmost overflow bit
    246 	xor	$i,$i
    247 	and	%rax,$ap
    248 	not	%rax
    249 	mov	$rp,$np
    250 	and	%rax,$np
    251 	mov	$num,$j			# j=num
    252 	or	$np,$ap			# ap=borrow?tp:rp
    253 .align	16
    254 .Lcopy:					# copy or in-place refresh
    255 	mov	($ap,$i,8),%rax
    256 	mov	$i,(%rsp,$i,8)		# zap temporary vector
    257 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
    258 	lea	1($i),$i
    259 	sub	\$1,$j
    260 	jnz	.Lcopy
    261 
    262 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
    263 	mov	\$1,%rax
    264 	mov	(%rsi),%r15
    265 	mov	8(%rsi),%r14
    266 	mov	16(%rsi),%r13
    267 	mov	24(%rsi),%r12
    268 	mov	32(%rsi),%rbp
    269 	mov	40(%rsi),%rbx
    270 	lea	48(%rsi),%rsp
    271 .Lmul_epilogue:
    272 	ret
    273 .size	bn_mul_mont,.-bn_mul_mont
    274 ___
    275 {{{
    276 my @A=("%r10","%r11");
    277 my @N=("%r13","%rdi");
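# @A[] accumulates the ap[j]*bp[i] partial products and @N[] the np[j]*m1
# ones; the two pairs are alternated through the 4x-unrolled steps so that
# each mulq can be issued while the previous product is still being folded
# in (the modulo scheduling mentioned in the August 2011 note above).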
    278 $code.=<<___;
    279 .type	bn_mul4x_mont,\@function,6
    280 .align	16
    281 bn_mul4x_mont:
    282 .Lmul4x_enter:
    283 	push	%rbx
    284 	push	%rbp
    285 	push	%r12
    286 	push	%r13
    287 	push	%r14
    288 	push	%r15
    289 
    290 	mov	${num}d,${num}d
    291 	lea	4($num),%r10
    292 	mov	%rsp,%r11
    293 	neg	%r10
    294 	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
    295 	and	\$-1024,%rsp		# minimize TLB usage
    296 
    297 	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
    298 .Lmul4x_body:
    299 	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
    300 	mov	%rdx,%r12		# reassign $bp
    301 ___
    302 		$bp="%r12";
    303 $code.=<<___;
    304 	mov	($n0),$n0		# pull n0[0] value
    305 	mov	($bp),$m0		# m0=bp[0]
    306 	mov	($ap),%rax
    307 
    308 	xor	$i,$i			# i=0
    309 	xor	$j,$j			# j=0
    310 
    311 	mov	$n0,$m1
    312 	mulq	$m0			# ap[0]*bp[0]
    313 	mov	%rax,$A[0]
    314 	mov	($np),%rax
    315 
    316 	imulq	$A[0],$m1		# "tp[0]"*n0
    317 	mov	%rdx,$A[1]
    318 
    319 	mulq	$m1			# np[0]*m1
    320 	add	%rax,$A[0]		# discarded
    321 	mov	8($ap),%rax
    322 	adc	\$0,%rdx
    323 	mov	%rdx,$N[1]
    324 
    325 	mulq	$m0
    326 	add	%rax,$A[1]
    327 	mov	8($np),%rax
    328 	adc	\$0,%rdx
    329 	mov	%rdx,$A[0]
    330 
    331 	mulq	$m1
    332 	add	%rax,$N[1]
    333 	mov	16($ap),%rax
    334 	adc	\$0,%rdx
    335 	add	$A[1],$N[1]
     336 	lea	4($j),$j		# j+=4
    337 	adc	\$0,%rdx
    338 	mov	$N[1],(%rsp)
    339 	mov	%rdx,$N[0]
    340 	jmp	.L1st4x
    341 .align	16
    342 .L1st4x:
    343 	mulq	$m0			# ap[j]*bp[0]
    344 	add	%rax,$A[0]
    345 	mov	-16($np,$j,8),%rax
    346 	adc	\$0,%rdx
    347 	mov	%rdx,$A[1]
    348 
    349 	mulq	$m1			# np[j]*m1
    350 	add	%rax,$N[0]
    351 	mov	-8($ap,$j,8),%rax
    352 	adc	\$0,%rdx
    353 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    354 	adc	\$0,%rdx
    355 	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    356 	mov	%rdx,$N[1]
    357 
    358 	mulq	$m0			# ap[j]*bp[0]
    359 	add	%rax,$A[1]
    360 	mov	-8($np,$j,8),%rax
    361 	adc	\$0,%rdx
    362 	mov	%rdx,$A[0]
    363 
    364 	mulq	$m1			# np[j]*m1
    365 	add	%rax,$N[1]
    366 	mov	($ap,$j,8),%rax
    367 	adc	\$0,%rdx
    368 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    369 	adc	\$0,%rdx
    370 	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    371 	mov	%rdx,$N[0]
    372 
    373 	mulq	$m0			# ap[j]*bp[0]
    374 	add	%rax,$A[0]
    375 	mov	($np,$j,8),%rax
    376 	adc	\$0,%rdx
    377 	mov	%rdx,$A[1]
    378 
    379 	mulq	$m1			# np[j]*m1
    380 	add	%rax,$N[0]
    381 	mov	8($ap,$j,8),%rax
    382 	adc	\$0,%rdx
    383 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    384 	adc	\$0,%rdx
    385 	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
    386 	mov	%rdx,$N[1]
    387 
    388 	mulq	$m0			# ap[j]*bp[0]
    389 	add	%rax,$A[1]
    390 	mov	8($np,$j,8),%rax
    391 	adc	\$0,%rdx
     392 	lea	4($j),$j		# j+=4
    393 	mov	%rdx,$A[0]
    394 
    395 	mulq	$m1			# np[j]*m1
    396 	add	%rax,$N[1]
    397 	mov	-16($ap,$j,8),%rax
    398 	adc	\$0,%rdx
    399 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    400 	adc	\$0,%rdx
    401 	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
    402 	mov	%rdx,$N[0]
    403 	cmp	$num,$j
    404 	jl	.L1st4x
    405 
    406 	mulq	$m0			# ap[j]*bp[0]
    407 	add	%rax,$A[0]
    408 	mov	-16($np,$j,8),%rax
    409 	adc	\$0,%rdx
    410 	mov	%rdx,$A[1]
    411 
    412 	mulq	$m1			# np[j]*m1
    413 	add	%rax,$N[0]
    414 	mov	-8($ap,$j,8),%rax
    415 	adc	\$0,%rdx
    416 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    417 	adc	\$0,%rdx
    418 	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    419 	mov	%rdx,$N[1]
    420 
    421 	mulq	$m0			# ap[j]*bp[0]
    422 	add	%rax,$A[1]
    423 	mov	-8($np,$j,8),%rax
    424 	adc	\$0,%rdx
    425 	mov	%rdx,$A[0]
    426 
    427 	mulq	$m1			# np[j]*m1
    428 	add	%rax,$N[1]
    429 	mov	($ap),%rax		# ap[0]
    430 	adc	\$0,%rdx
    431 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    432 	adc	\$0,%rdx
    433 	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    434 	mov	%rdx,$N[0]
    435 
    436 	xor	$N[1],$N[1]
    437 	add	$A[0],$N[0]
    438 	adc	\$0,$N[1]
    439 	mov	$N[0],-8(%rsp,$j,8)
    440 	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
    441 
    442 	lea	1($i),$i		# i++
    443 .align	4
    444 .Louter4x:
    445 	mov	($bp,$i,8),$m0		# m0=bp[i]
    446 	xor	$j,$j			# j=0
    447 	mov	(%rsp),$A[0]
    448 	mov	$n0,$m1
    449 	mulq	$m0			# ap[0]*bp[i]
    450 	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
    451 	mov	($np),%rax
    452 	adc	\$0,%rdx
    453 
    454 	imulq	$A[0],$m1		# tp[0]*n0
    455 	mov	%rdx,$A[1]
    456 
    457 	mulq	$m1			# np[0]*m1
    458 	add	%rax,$A[0]		# "$N[0]", discarded
    459 	mov	8($ap),%rax
    460 	adc	\$0,%rdx
    461 	mov	%rdx,$N[1]
    462 
    463 	mulq	$m0			# ap[j]*bp[i]
    464 	add	%rax,$A[1]
    465 	mov	8($np),%rax
    466 	adc	\$0,%rdx
    467 	add	8(%rsp),$A[1]		# +tp[1]
    468 	adc	\$0,%rdx
    469 	mov	%rdx,$A[0]
    470 
    471 	mulq	$m1			# np[j]*m1
    472 	add	%rax,$N[1]
    473 	mov	16($ap),%rax
    474 	adc	\$0,%rdx
    475 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
     476 	lea	4($j),$j		# j+=4
    477 	adc	\$0,%rdx
    478 	mov	$N[1],(%rsp)		# tp[j-1]
    479 	mov	%rdx,$N[0]
    480 	jmp	.Linner4x
    481 .align	16
    482 .Linner4x:
    483 	mulq	$m0			# ap[j]*bp[i]
    484 	add	%rax,$A[0]
    485 	mov	-16($np,$j,8),%rax
    486 	adc	\$0,%rdx
    487 	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
    488 	adc	\$0,%rdx
    489 	mov	%rdx,$A[1]
    490 
    491 	mulq	$m1			# np[j]*m1
    492 	add	%rax,$N[0]
    493 	mov	-8($ap,$j,8),%rax
    494 	adc	\$0,%rdx
    495 	add	$A[0],$N[0]
    496 	adc	\$0,%rdx
    497 	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    498 	mov	%rdx,$N[1]
    499 
    500 	mulq	$m0			# ap[j]*bp[i]
    501 	add	%rax,$A[1]
    502 	mov	-8($np,$j,8),%rax
    503 	adc	\$0,%rdx
    504 	add	-8(%rsp,$j,8),$A[1]
    505 	adc	\$0,%rdx
    506 	mov	%rdx,$A[0]
    507 
    508 	mulq	$m1			# np[j]*m1
    509 	add	%rax,$N[1]
    510 	mov	($ap,$j,8),%rax
    511 	adc	\$0,%rdx
    512 	add	$A[1],$N[1]
    513 	adc	\$0,%rdx
    514 	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    515 	mov	%rdx,$N[0]
    516 
    517 	mulq	$m0			# ap[j]*bp[i]
    518 	add	%rax,$A[0]
    519 	mov	($np,$j,8),%rax
    520 	adc	\$0,%rdx
    521 	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
    522 	adc	\$0,%rdx
    523 	mov	%rdx,$A[1]
    524 
    525 	mulq	$m1			# np[j]*m1
    526 	add	%rax,$N[0]
    527 	mov	8($ap,$j,8),%rax
    528 	adc	\$0,%rdx
    529 	add	$A[0],$N[0]
    530 	adc	\$0,%rdx
    531 	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
    532 	mov	%rdx,$N[1]
    533 
    534 	mulq	$m0			# ap[j]*bp[i]
    535 	add	%rax,$A[1]
    536 	mov	8($np,$j,8),%rax
    537 	adc	\$0,%rdx
    538 	add	8(%rsp,$j,8),$A[1]
    539 	adc	\$0,%rdx
     540 	lea	4($j),$j		# j+=4
    541 	mov	%rdx,$A[0]
    542 
    543 	mulq	$m1			# np[j]*m1
    544 	add	%rax,$N[1]
    545 	mov	-16($ap,$j,8),%rax
    546 	adc	\$0,%rdx
    547 	add	$A[1],$N[1]
    548 	adc	\$0,%rdx
    549 	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
    550 	mov	%rdx,$N[0]
    551 	cmp	$num,$j
    552 	jl	.Linner4x
    553 
    554 	mulq	$m0			# ap[j]*bp[i]
    555 	add	%rax,$A[0]
    556 	mov	-16($np,$j,8),%rax
    557 	adc	\$0,%rdx
    558 	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
    559 	adc	\$0,%rdx
    560 	mov	%rdx,$A[1]
    561 
    562 	mulq	$m1			# np[j]*m1
    563 	add	%rax,$N[0]
    564 	mov	-8($ap,$j,8),%rax
    565 	adc	\$0,%rdx
    566 	add	$A[0],$N[0]
    567 	adc	\$0,%rdx
    568 	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    569 	mov	%rdx,$N[1]
    570 
    571 	mulq	$m0			# ap[j]*bp[i]
    572 	add	%rax,$A[1]
    573 	mov	-8($np,$j,8),%rax
    574 	adc	\$0,%rdx
    575 	add	-8(%rsp,$j,8),$A[1]
    576 	adc	\$0,%rdx
    577 	lea	1($i),$i		# i++
    578 	mov	%rdx,$A[0]
    579 
    580 	mulq	$m1			# np[j]*m1
    581 	add	%rax,$N[1]
    582 	mov	($ap),%rax		# ap[0]
    583 	adc	\$0,%rdx
    584 	add	$A[1],$N[1]
    585 	adc	\$0,%rdx
    586 	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    587 	mov	%rdx,$N[0]
    588 
    589 	xor	$N[1],$N[1]
    590 	add	$A[0],$N[0]
    591 	adc	\$0,$N[1]
    592 	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
    593 	adc	\$0,$N[1]
    594 	mov	$N[0],-8(%rsp,$j,8)
    595 	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
    596 
    597 	cmp	$num,$i
    598 	jl	.Louter4x
    599 ___
    600 {
    601 my @ri=("%rax","%rdx",$m0,$m1);
    602 $code.=<<___;
    603 	mov	16(%rsp,$num,8),$rp	# restore $rp
    604 	mov	0(%rsp),@ri[0]		# tp[0]
    605 	pxor	%xmm0,%xmm0
    606 	mov	8(%rsp),@ri[1]		# tp[1]
    607 	shr	\$2,$num		# num/=4
    608 	lea	(%rsp),$ap		# borrow ap for tp
    609 	xor	$i,$i			# i=0 and clear CF!
    610 
    611 	sub	0($np),@ri[0]
    612 	mov	16($ap),@ri[2]		# tp[2]
    613 	mov	24($ap),@ri[3]		# tp[3]
    614 	sbb	8($np),@ri[1]
    615 	lea	-1($num),$j		# j=num/4-1
    616 	jmp	.Lsub4x
    617 .align	16
    618 .Lsub4x:
    619 	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
    620 	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
    621 	sbb	16($np,$i,8),@ri[2]
    622 	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
    623 	mov	40($ap,$i,8),@ri[1]
    624 	sbb	24($np,$i,8),@ri[3]
    625 	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
    626 	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
    627 	sbb	32($np,$i,8),@ri[0]
    628 	mov	48($ap,$i,8),@ri[2]
    629 	mov	56($ap,$i,8),@ri[3]
    630 	sbb	40($np,$i,8),@ri[1]
     631 	lea	4($i),$i		# i+=4
     632 	dec	$j			# doesn't affect CF!
    633 	jnz	.Lsub4x
    634 
    635 	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
    636 	mov	32($ap,$i,8),@ri[0]	# load overflow bit
    637 	sbb	16($np,$i,8),@ri[2]
    638 	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
    639 	sbb	24($np,$i,8),@ri[3]
    640 	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
    641 
    642 	sbb	\$0,@ri[0]		# handle upmost overflow bit
    643 	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
    644 	xor	$i,$i			# i=0
    645 	and	@ri[0],$ap
    646 	not	@ri[0]
    647 	mov	$rp,$np
    648 	and	@ri[0],$np
    649 	lea	-1($num),$j
    650 	or	$np,$ap			# ap=borrow?tp:rp
    651 
    652 	movdqu	($ap),%xmm1
    653 	movdqa	%xmm0,(%rsp)
    654 	movdqu	%xmm1,($rp)
    655 	jmp	.Lcopy4x
    656 .align	16
    657 .Lcopy4x:					# copy or in-place refresh
    658 	movdqu	16($ap,$i),%xmm2
    659 	movdqu	32($ap,$i),%xmm1
    660 	movdqa	%xmm0,16(%rsp,$i)
    661 	movdqu	%xmm2,16($rp,$i)
    662 	movdqa	%xmm0,32(%rsp,$i)
    663 	movdqu	%xmm1,32($rp,$i)
    664 	lea	32($i),$i
    665 	dec	$j
    666 	jnz	.Lcopy4x
    667 
    668 	shl	\$2,$num
    669 	movdqu	16($ap,$i),%xmm2
    670 	movdqa	%xmm0,16(%rsp,$i)
    671 	movdqu	%xmm2,16($rp,$i)
    672 ___
    673 }
    674 $code.=<<___;
    675 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
    676 	mov	\$1,%rax
    677 	mov	(%rsi),%r15
    678 	mov	8(%rsi),%r14
    679 	mov	16(%rsi),%r13
    680 	mov	24(%rsi),%r12
    681 	mov	32(%rsi),%rbp
    682 	mov	40(%rsi),%rbx
    683 	lea	48(%rsi),%rsp
    684 .Lmul4x_epilogue:
    685 	ret
    686 .size	bn_mul4x_mont,.-bn_mul4x_mont
    687 ___
    688 }}}
    689 {{{
    691 ######################################################################
    692 # void bn_sqr4x_mont(
    693 my $rptr="%rdi";	# const BN_ULONG *rptr,
    694 my $aptr="%rsi";	# const BN_ULONG *aptr,
    695 my $bptr="%rdx";	# not used
    696 my $nptr="%rcx";	# const BN_ULONG *nptr,
    697 my $n0  ="%r8";		# const BN_ULONG *n0);
    698 my $num ="%r9";		# int num, has to be divisible by 4 and
    699 			# not less than 8
    700 
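# The dispatcher at the top of bn_mul_mont branches here only when bp == ap,
# num is a multiple of 4 and num >= 8, so this routine computes
# ap[]*ap[]*2^(-64*num) mod np[] and never looks at bp.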
    701 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
    702 my @A0=("%r10","%r11");
    703 my @A1=("%r12","%r13");
    704 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
    705 
    706 $code.=<<___;
    707 .type	bn_sqr4x_mont,\@function,6
    708 .align	16
    709 bn_sqr4x_mont:
    710 .Lsqr4x_enter:
    711 	push	%rbx
    712 	push	%rbp
    713 	push	%r12
    714 	push	%r13
    715 	push	%r14
    716 	push	%r15
    717 
    718 	shl	\$3,${num}d		# convert $num to bytes
    719 	xor	%r10,%r10
    720 	mov	%rsp,%r11		# put aside %rsp
    721 	sub	$num,%r10		# -$num
    722 	mov	($n0),$n0		# *n0
    723 	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
    724 	and	\$-1024,%rsp		# minimize TLB usage
    725 	##############################################################
    726 	# Stack layout
    727 	#
    728 	# +0	saved $num, used in reduction section
    729 	# +8	&t[2*$num], used in reduction section
    730 	# +32	saved $rptr
    731 	# +40	saved $nptr
    732 	# +48	saved *n0
    733 	# +56	saved %rsp
    734 	# +64	t[2*$num]
    735 	#
    736 	mov	$rptr,32(%rsp)		# save $rptr
    737 	mov	$nptr,40(%rsp)
    738 	mov	$n0,  48(%rsp)
    739 	mov	%r11, 56(%rsp)		# save original %rsp
    740 .Lsqr4x_body:
    741 	##############################################################
    742 	# Squaring part:
    743 	#
    744 	# a) multiply-n-add everything but a[i]*a[i];
    745 	# b) shift result of a) by 1 to the left and accumulate
    746 	#    a[i]*a[i] products;
    747 	#
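	# i.e. for a = sum_i a[i]*2^(64*i) the two steps compute
	#
	#	a^2 = 2*sum_{i<j} a[i]*a[j]*2^(64*(i+j))	# step a), doubled in b)
	#	    +   sum_i   a[i]*a[i]*2^(128*i)		# added during step b)
	#
	# e.g. a two-word square is a[0]^2 + 2*a[0]*a[1]*2^64 + a[1]^2*2^128,
	# so roughly half of the word multiplications of a general product
	# are required.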
    748 	lea	32(%r10),$i		# $i=-($num-32)
    749 	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
    750 
    751 	mov	$num,$j			# $j=$num
    752 
    753 					# comments apply to $num==8 case
    754 	mov	-32($aptr,$i),$a0	# a[0]
    755 	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
    756 	mov	-24($aptr,$i),%rax	# a[1]
    757 	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
    758 	mov	-16($aptr,$i),$ai	# a[2]
    759 	mov	%rax,$a1
    760 
    761 	mul	$a0			# a[1]*a[0]
    762 	mov	%rax,$A0[0]		# a[1]*a[0]
    763 	 mov	$ai,%rax		# a[2]
    764 	mov	%rdx,$A0[1]
    765 	mov	$A0[0],-24($tptr,$i)	# t[1]
    766 
    767 	xor	$A0[0],$A0[0]
    768 	mul	$a0			# a[2]*a[0]
    769 	add	%rax,$A0[1]
    770 	 mov	$ai,%rax
    771 	adc	%rdx,$A0[0]
    772 	mov	$A0[1],-16($tptr,$i)	# t[2]
    773 
    774 	lea	-16($i),$j		# j=-16
    775 
    776 
    777 	 mov	8($aptr,$j),$ai		# a[3]
    778 	mul	$a1			# a[2]*a[1]
    779 	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
    780 	 mov	$ai,%rax
    781 	mov	%rdx,$A1[1]
    782 
    783 	xor	$A0[1],$A0[1]
    784 	add	$A1[0],$A0[0]
    785 	 lea	16($j),$j
    786 	adc	\$0,$A0[1]
    787 	mul	$a0			# a[3]*a[0]
    788 	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
    789 	 mov	$ai,%rax
    790 	adc	%rdx,$A0[1]
    791 	mov	$A0[0],-8($tptr,$j)	# t[3]
    792 	jmp	.Lsqr4x_1st
    793 
    794 .align	16
    795 .Lsqr4x_1st:
    796 	 mov	($aptr,$j),$ai		# a[4]
    797 	xor	$A1[0],$A1[0]
    798 	mul	$a1			# a[3]*a[1]
    799 	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
    800 	 mov	$ai,%rax
    801 	adc	%rdx,$A1[0]
    802 
    803 	xor	$A0[0],$A0[0]
    804 	add	$A1[1],$A0[1]
    805 	adc	\$0,$A0[0]
    806 	mul	$a0			# a[4]*a[0]
    807 	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
    808 	 mov	$ai,%rax		# a[3]
    809 	adc	%rdx,$A0[0]
    810 	mov	$A0[1],($tptr,$j)	# t[4]
    811 
    812 
    813 	 mov	8($aptr,$j),$ai		# a[5]
    814 	xor	$A1[1],$A1[1]
    815 	mul	$a1			# a[4]*a[3]
    816 	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
    817 	 mov	$ai,%rax
    818 	adc	%rdx,$A1[1]
    819 
    820 	xor	$A0[1],$A0[1]
    821 	add	$A1[0],$A0[0]
    822 	adc	\$0,$A0[1]
    823 	mul	$a0			# a[5]*a[2]
    824 	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
    825 	 mov	$ai,%rax
    826 	adc	%rdx,$A0[1]
    827 	mov	$A0[0],8($tptr,$j)	# t[5]
    828 
    829 	 mov	16($aptr,$j),$ai	# a[6]
    830 	xor	$A1[0],$A1[0]
    831 	mul	$a1			# a[5]*a[3]
    832 	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
    833 	 mov	$ai,%rax
    834 	adc	%rdx,$A1[0]
    835 
    836 	xor	$A0[0],$A0[0]
    837 	add	$A1[1],$A0[1]
    838 	adc	\$0,$A0[0]
    839 	mul	$a0			# a[6]*a[2]
    840 	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
    841 	 mov	$ai,%rax		# a[3]
    842 	adc	%rdx,$A0[0]
    843 	mov	$A0[1],16($tptr,$j)	# t[6]
    844 
    845 
    846 	 mov	24($aptr,$j),$ai	# a[7]
    847 	xor	$A1[1],$A1[1]
    848 	mul	$a1			# a[6]*a[5]
    849 	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
    850 	 mov	$ai,%rax
    851 	adc	%rdx,$A1[1]
    852 
    853 	xor	$A0[1],$A0[1]
    854 	add	$A1[0],$A0[0]
    855 	 lea	32($j),$j
    856 	adc	\$0,$A0[1]
    857 	mul	$a0			# a[7]*a[4]
    858 	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
    859 	 mov	$ai,%rax
    860 	adc	%rdx,$A0[1]
    861 	mov	$A0[0],-8($tptr,$j)	# t[7]
    862 
    863 	cmp	\$0,$j
    864 	jne	.Lsqr4x_1st
    865 
    866 	xor	$A1[0],$A1[0]
    867 	add	$A0[1],$A1[1]
    868 	adc	\$0,$A1[0]
    869 	mul	$a1			# a[7]*a[5]
    870 	add	%rax,$A1[1]
    871 	adc	%rdx,$A1[0]
    872 
    873 	mov	$A1[1],($tptr)		# t[8]
    874 	lea	16($i),$i
    875 	mov	$A1[0],8($tptr)		# t[9]
    876 	jmp	.Lsqr4x_outer
    877 
    878 .align	16
    879 .Lsqr4x_outer:				# comments apply to $num==6 case
    880 	mov	-32($aptr,$i),$a0	# a[0]
    881 	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
    882 	mov	-24($aptr,$i),%rax	# a[1]
    883 	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
    884 	mov	-16($aptr,$i),$ai	# a[2]
    885 	mov	%rax,$a1
    886 
    887 	mov	-24($tptr,$i),$A0[0]	# t[1]
    888 	xor	$A0[1],$A0[1]
    889 	mul	$a0			# a[1]*a[0]
    890 	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
    891 	 mov	$ai,%rax		# a[2]
    892 	adc	%rdx,$A0[1]
    893 	mov	$A0[0],-24($tptr,$i)	# t[1]
    894 
    895 	xor	$A0[0],$A0[0]
    896 	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
    897 	adc	\$0,$A0[0]
    898 	mul	$a0			# a[2]*a[0]
    899 	add	%rax,$A0[1]
    900 	 mov	$ai,%rax
    901 	adc	%rdx,$A0[0]
    902 	mov	$A0[1],-16($tptr,$i)	# t[2]
    903 
    904 	lea	-16($i),$j		# j=-16
    905 	xor	$A1[0],$A1[0]
    906 
    907 
    908 	 mov	8($aptr,$j),$ai		# a[3]
    909 	xor	$A1[1],$A1[1]
    910 	add	8($tptr,$j),$A1[0]
    911 	adc	\$0,$A1[1]
    912 	mul	$a1			# a[2]*a[1]
    913 	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
    914 	 mov	$ai,%rax
    915 	adc	%rdx,$A1[1]
    916 
    917 	xor	$A0[1],$A0[1]
    918 	add	$A1[0],$A0[0]
    919 	adc	\$0,$A0[1]
    920 	mul	$a0			# a[3]*a[0]
    921 	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
    922 	 mov	$ai,%rax
    923 	adc	%rdx,$A0[1]
    924 	mov	$A0[0],8($tptr,$j)	# t[3]
    925 
    926 	lea	16($j),$j
    927 	jmp	.Lsqr4x_inner
    928 
    929 .align	16
    930 .Lsqr4x_inner:
    931 	 mov	($aptr,$j),$ai		# a[4]
    932 	xor	$A1[0],$A1[0]
    933 	add	($tptr,$j),$A1[1]
    934 	adc	\$0,$A1[0]
    935 	mul	$a1			# a[3]*a[1]
    936 	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
    937 	 mov	$ai,%rax
    938 	adc	%rdx,$A1[0]
    939 
    940 	xor	$A0[0],$A0[0]
    941 	add	$A1[1],$A0[1]
    942 	adc	\$0,$A0[0]
    943 	mul	$a0			# a[4]*a[0]
    944 	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
    945 	 mov	$ai,%rax		# a[3]
    946 	adc	%rdx,$A0[0]
    947 	mov	$A0[1],($tptr,$j)	# t[4]
    948 
    949 	 mov	8($aptr,$j),$ai		# a[5]
    950 	xor	$A1[1],$A1[1]
    951 	add	8($tptr,$j),$A1[0]
    952 	adc	\$0,$A1[1]
    953 	mul	$a1			# a[4]*a[3]
    954 	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
    955 	 mov	$ai,%rax
    956 	adc	%rdx,$A1[1]
    957 
    958 	xor	$A0[1],$A0[1]
    959 	add	$A1[0],$A0[0]
    960 	lea	16($j),$j		# j++
    961 	adc	\$0,$A0[1]
    962 	mul	$a0			# a[5]*a[2]
    963 	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
    964 	 mov	$ai,%rax
    965 	adc	%rdx,$A0[1]
    966 	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
    967 
    968 	cmp	\$0,$j
    969 	jne	.Lsqr4x_inner
    970 
    971 	xor	$A1[0],$A1[0]
    972 	add	$A0[1],$A1[1]
    973 	adc	\$0,$A1[0]
    974 	mul	$a1			# a[5]*a[3]
    975 	add	%rax,$A1[1]
    976 	adc	%rdx,$A1[0]
    977 
    978 	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
    979 	mov	$A1[0],8($tptr)		# t[7], "preloaded t[3]" below
    980 
    981 	add	\$16,$i
    982 	jnz	.Lsqr4x_outer
    983 
    984 					# comments apply to $num==4 case
    985 	mov	-32($aptr),$a0		# a[0]
    986 	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
    987 	mov	-24($aptr),%rax		# a[1]
    988 	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
    989 	mov	-16($aptr),$ai		# a[2]
    990 	mov	%rax,$a1
    991 
    992 	xor	$A0[1],$A0[1]
    993 	mul	$a0			# a[1]*a[0]
    994 	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
    995 	 mov	$ai,%rax		# a[2]
    996 	adc	%rdx,$A0[1]
    997 	mov	$A0[0],-24($tptr)	# t[1]
    998 
    999 	xor	$A0[0],$A0[0]
   1000 	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
   1001 	adc	\$0,$A0[0]
   1002 	mul	$a0			# a[2]*a[0]
   1003 	add	%rax,$A0[1]
   1004 	 mov	$ai,%rax
   1005 	adc	%rdx,$A0[0]
   1006 	mov	$A0[1],-16($tptr)	# t[2]
   1007 
   1008 	 mov	-8($aptr),$ai		# a[3]
   1009 	mul	$a1			# a[2]*a[1]
   1010 	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
   1011 	 mov	$ai,%rax
   1012 	adc	\$0,%rdx
   1013 
   1014 	xor	$A0[1],$A0[1]
   1015 	add	$A1[0],$A0[0]
   1016 	 mov	%rdx,$A1[1]
   1017 	adc	\$0,$A0[1]
   1018 	mul	$a0			# a[3]*a[0]
   1019 	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
   1020 	 mov	$ai,%rax
   1021 	adc	%rdx,$A0[1]
   1022 	mov	$A0[0],-8($tptr)	# t[3]
   1023 
   1024 	xor	$A1[0],$A1[0]
   1025 	add	$A0[1],$A1[1]
   1026 	adc	\$0,$A1[0]
   1027 	mul	$a1			# a[3]*a[1]
   1028 	add	%rax,$A1[1]
   1029 	 mov	-16($aptr),%rax		# a[2]
   1030 	adc	%rdx,$A1[0]
   1031 
   1032 	mov	$A1[1],($tptr)		# t[4]
   1033 	mov	$A1[0],8($tptr)		# t[5]
   1034 
   1035 	mul	$ai			# a[2]*a[3]
   1036 ___
   1037 {
   1038 my ($shift,$carry)=($a0,$a1);
   1039 my @S=(@A1,$ai,$n0);
   1040 $code.=<<___;
   1041 	 add	\$16,$i
   1042 	 xor	$shift,$shift
   1043 	 sub	$num,$i			# $i=16-$num
   1044 	 xor	$carry,$carry
   1045 
   1046 	add	$A1[0],%rax		# t[5]
   1047 	adc	\$0,%rdx
   1048 	mov	%rax,8($tptr)		# t[5]
   1049 	mov	%rdx,16($tptr)		# t[6]
   1050 	mov	$carry,24($tptr)	# t[7]
   1051 
   1052 	 mov	-16($aptr,$i),%rax	# a[0]
   1053 	lea	64(%rsp,$num,2),$tptr
   1054 	 xor	$A0[0],$A0[0]		# t[0]
   1055 	 mov	-24($tptr,$i,2),$A0[1]	# t[1]
   1056 
   1057 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1058 	shr	\$63,$A0[0]
   1059 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1060 	shr	\$63,$A0[1]
   1061 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1062 	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
   1063 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1064 	mul	%rax			# a[i]*a[i]
   1065 	neg	$carry			# mov $carry,cf
   1066 	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
   1067 	adc	%rax,$S[0]
   1068 	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
   1069 	mov	$S[0],-32($tptr,$i,2)
   1070 	adc	%rdx,$S[1]
   1071 
   1072 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
   1073 	 mov	$S[1],-24($tptr,$i,2)
   1074 	 sbb	$carry,$carry		# mov cf,$carry
   1075 	shr	\$63,$A0[0]
   1076 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1077 	shr	\$63,$A0[1]
   1078 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1079 	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
   1080 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1081 	mul	%rax			# a[i]*a[i]
   1082 	neg	$carry			# mov $carry,cf
   1083 	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
   1084 	adc	%rax,$S[2]
   1085 	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
   1086 	mov	$S[2],-16($tptr,$i,2)
   1087 	adc	%rdx,$S[3]
   1088 	lea	16($i),$i
   1089 	mov	$S[3],-40($tptr,$i,2)
   1090 	sbb	$carry,$carry		# mov cf,$carry
   1091 	jmp	.Lsqr4x_shift_n_add
   1092 
   1093 .align	16
   1094 .Lsqr4x_shift_n_add:
   1095 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1096 	shr	\$63,$A0[0]
   1097 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1098 	shr	\$63,$A0[1]
   1099 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1100 	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
   1101 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1102 	mul	%rax			# a[i]*a[i]
   1103 	neg	$carry			# mov $carry,cf
   1104 	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
   1105 	adc	%rax,$S[0]
   1106 	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
   1107 	mov	$S[0],-32($tptr,$i,2)
   1108 	adc	%rdx,$S[1]
   1109 
   1110 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
   1111 	 mov	$S[1],-24($tptr,$i,2)
   1112 	 sbb	$carry,$carry		# mov cf,$carry
   1113 	shr	\$63,$A0[0]
   1114 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1115 	shr	\$63,$A0[1]
   1116 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1117 	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
   1118 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1119 	mul	%rax			# a[i]*a[i]
   1120 	neg	$carry			# mov $carry,cf
   1121 	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
   1122 	adc	%rax,$S[2]
   1123 	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
   1124 	mov	$S[2],-16($tptr,$i,2)
   1125 	adc	%rdx,$S[3]
   1126 
   1127 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1128 	 mov	$S[3],-8($tptr,$i,2)
   1129 	 sbb	$carry,$carry		# mov cf,$carry
   1130 	shr	\$63,$A0[0]
   1131 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1132 	shr	\$63,$A0[1]
   1133 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1134 	 mov	16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
   1135 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1136 	mul	%rax			# a[i]*a[i]
   1137 	neg	$carry			# mov $carry,cf
   1138 	 mov	24($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
   1139 	adc	%rax,$S[0]
   1140 	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
   1141 	mov	$S[0],0($tptr,$i,2)
   1142 	adc	%rdx,$S[1]
   1143 
   1144 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
   1145 	 mov	$S[1],8($tptr,$i,2)
   1146 	 sbb	$carry,$carry		# mov cf,$carry
   1147 	shr	\$63,$A0[0]
   1148 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1149 	shr	\$63,$A0[1]
   1150 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1151 	 mov	32($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
   1152 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1153 	mul	%rax			# a[i]*a[i]
   1154 	neg	$carry			# mov $carry,cf
   1155 	 mov	40($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
   1156 	adc	%rax,$S[2]
   1157 	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
   1158 	mov	$S[2],16($tptr,$i,2)
   1159 	adc	%rdx,$S[3]
   1160 	mov	$S[3],24($tptr,$i,2)
   1161 	sbb	$carry,$carry		# mov cf,$carry
   1162 	add	\$32,$i
   1163 	jnz	.Lsqr4x_shift_n_add
   1164 
   1165 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1166 	shr	\$63,$A0[0]
   1167 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1168 	shr	\$63,$A0[1]
   1169 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1170 	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1171 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1172 	mul	%rax			# a[i]*a[i]
   1173 	neg	$carry			# mov $carry,cf
   1174 	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1175 	adc	%rax,$S[0]
   1176 	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
   1177 	mov	$S[0],-32($tptr)
   1178 	adc	%rdx,$S[1]
   1179 
   1180 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
   1181 	 mov	$S[1],-24($tptr)
   1182 	 sbb	$carry,$carry		# mov cf,$carry
   1183 	shr	\$63,$A0[0]
   1184 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1185 	shr	\$63,$A0[1]
   1186 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1187 	mul	%rax			# a[i]*a[i]
   1188 	neg	$carry			# mov $carry,cf
   1189 	adc	%rax,$S[2]
   1190 	adc	%rdx,$S[3]
   1191 	mov	$S[2],-16($tptr)
   1192 	mov	$S[3],-8($tptr)
   1193 ___
   1194 }
   1196 ##############################################################
   1197 # Montgomery reduction part, "word-by-word" algorithm.
   1198 #
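# In effect, with t[] holding the 2*num-word square computed above:
#
#	for (i=0; i<num; i++) {
#		m    = t[i]*n0 mod 2^64;
#		t[] += m*n[] << (64*i);		# clears word t[i]
#	}
#	# upper half of t[] plus the final carry ($topbit) is the result
#
# prior to the conditional subtraction of n[] performed in the
# post-condition section below. The loop body is interleaved two
# reduction words at a time ($m0/$m1) and modulo-scheduled, hence
# the "modsched" markers.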
   1199 {
   1200 my ($topbit,$nptr)=("%rbp",$aptr);
   1201 my ($m0,$m1)=($a0,$a1);
   1202 my @Ni=("%rbx","%r9");
   1203 $code.=<<___;
   1204 	mov	40(%rsp),$nptr		# restore $nptr
   1205 	mov	48(%rsp),$n0		# restore *n0
   1206 	xor	$j,$j
   1207 	mov	$num,0(%rsp)		# save $num
   1208 	sub	$num,$j			# $j=-$num
   1209 	 mov	64(%rsp),$A0[0]		# t[0]		# modsched #
   1210 	 mov	$n0,$m0			#		# modsched #
   1211 	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
   1212 	lea	64(%rsp,$num),$tptr	# end of t[] window
   1213 	mov	%rax,8(%rsp)		# save end of t[] buffer
   1214 	lea	($nptr,$num),$nptr	# end of n[] buffer
   1215 	xor	$topbit,$topbit		# $topbit=0
   1216 
   1217 	mov	0($nptr,$j),%rax	# n[0]		# modsched #
   1218 	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
   1219 	 imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
   1220 	 mov	%rax,$Ni[0]		#		# modsched #
   1221 	jmp	.Lsqr4x_mont_outer
   1222 
   1223 .align	16
   1224 .Lsqr4x_mont_outer:
   1225 	xor	$A0[1],$A0[1]
   1226 	mul	$m0			# n[0]*m0
   1227 	add	%rax,$A0[0]		# n[0]*m0+t[0]
   1228 	 mov	$Ni[1],%rax
   1229 	adc	%rdx,$A0[1]
   1230 	mov	$n0,$m1
   1231 
   1232 	xor	$A0[0],$A0[0]
   1233 	add	8($tptr,$j),$A0[1]
   1234 	adc	\$0,$A0[0]
   1235 	mul	$m0			# n[1]*m0
   1236 	add	%rax,$A0[1]		# n[1]*m0+t[1]
   1237 	 mov	$Ni[0],%rax
   1238 	adc	%rdx,$A0[0]
   1239 
   1240 	imulq	$A0[1],$m1
   1241 
   1242 	mov	16($nptr,$j),$Ni[0]	# n[2]
   1243 	xor	$A1[1],$A1[1]
   1244 	add	$A0[1],$A1[0]
   1245 	adc	\$0,$A1[1]
   1246 	mul	$m1			# n[0]*m1
   1247 	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
   1248 	 mov	$Ni[0],%rax
   1249 	adc	%rdx,$A1[1]
   1250 	mov	$A1[0],8($tptr,$j)	# "t[1]"
   1251 
   1252 	xor	$A0[1],$A0[1]
   1253 	add	16($tptr,$j),$A0[0]
   1254 	adc	\$0,$A0[1]
   1255 	mul	$m0			# n[2]*m0
   1256 	add	%rax,$A0[0]		# n[2]*m0+t[2]
   1257 	 mov	$Ni[1],%rax
   1258 	adc	%rdx,$A0[1]
   1259 
   1260 	mov	24($nptr,$j),$Ni[1]	# n[3]
   1261 	xor	$A1[0],$A1[0]
   1262 	add	$A0[0],$A1[1]
   1263 	adc	\$0,$A1[0]
   1264 	mul	$m1			# n[1]*m1
   1265 	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
   1266 	 mov	$Ni[1],%rax
   1267 	adc	%rdx,$A1[0]
   1268 	mov	$A1[1],16($tptr,$j)	# "t[2]"
   1269 
   1270 	xor	$A0[0],$A0[0]
   1271 	add	24($tptr,$j),$A0[1]
   1272 	lea	32($j),$j
   1273 	adc	\$0,$A0[0]
   1274 	mul	$m0			# n[3]*m0
   1275 	add	%rax,$A0[1]		# n[3]*m0+t[3]
   1276 	 mov	$Ni[0],%rax
   1277 	adc	%rdx,$A0[0]
   1278 	jmp	.Lsqr4x_mont_inner
   1279 
   1280 .align	16
   1281 .Lsqr4x_mont_inner:
   1282 	mov	($nptr,$j),$Ni[0]	# n[4]
   1283 	xor	$A1[1],$A1[1]
   1284 	add	$A0[1],$A1[0]
   1285 	adc	\$0,$A1[1]
   1286 	mul	$m1			# n[2]*m1
   1287 	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
   1288 	 mov	$Ni[0],%rax
   1289 	adc	%rdx,$A1[1]
   1290 	mov	$A1[0],-8($tptr,$j)	# "t[3]"
   1291 
   1292 	xor	$A0[1],$A0[1]
   1293 	add	($tptr,$j),$A0[0]
   1294 	adc	\$0,$A0[1]
   1295 	mul	$m0			# n[4]*m0
   1296 	add	%rax,$A0[0]		# n[4]*m0+t[4]
   1297 	 mov	$Ni[1],%rax
   1298 	adc	%rdx,$A0[1]
   1299 
   1300 	mov	8($nptr,$j),$Ni[1]	# n[5]
   1301 	xor	$A1[0],$A1[0]
   1302 	add	$A0[0],$A1[1]
   1303 	adc	\$0,$A1[0]
   1304 	mul	$m1			# n[3]*m1
   1305 	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
   1306 	 mov	$Ni[1],%rax
   1307 	adc	%rdx,$A1[0]
   1308 	mov	$A1[1],($tptr,$j)	# "t[4]"
   1309 
   1310 	xor	$A0[0],$A0[0]
   1311 	add	8($tptr,$j),$A0[1]
   1312 	adc	\$0,$A0[0]
   1313 	mul	$m0			# n[5]*m0
   1314 	add	%rax,$A0[1]		# n[5]*m0+t[5]
   1315 	 mov	$Ni[0],%rax
   1316 	adc	%rdx,$A0[0]
   1317 
   1318 
   1319 	mov	16($nptr,$j),$Ni[0]	# n[6]
   1320 	xor	$A1[1],$A1[1]
   1321 	add	$A0[1],$A1[0]
   1322 	adc	\$0,$A1[1]
   1323 	mul	$m1			# n[4]*m1
   1324 	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
   1325 	 mov	$Ni[0],%rax
   1326 	adc	%rdx,$A1[1]
   1327 	mov	$A1[0],8($tptr,$j)	# "t[5]"
   1328 
   1329 	xor	$A0[1],$A0[1]
   1330 	add	16($tptr,$j),$A0[0]
   1331 	adc	\$0,$A0[1]
   1332 	mul	$m0			# n[6]*m0
   1333 	add	%rax,$A0[0]		# n[6]*m0+t[6]
   1334 	 mov	$Ni[1],%rax
   1335 	adc	%rdx,$A0[1]
   1336 
   1337 	mov	24($nptr,$j),$Ni[1]	# n[7]
   1338 	xor	$A1[0],$A1[0]
   1339 	add	$A0[0],$A1[1]
   1340 	adc	\$0,$A1[0]
   1341 	mul	$m1			# n[5]*m1
   1342 	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
   1343 	 mov	$Ni[1],%rax
   1344 	adc	%rdx,$A1[0]
   1345 	mov	$A1[1],16($tptr,$j)	# "t[6]"
   1346 
   1347 	xor	$A0[0],$A0[0]
   1348 	add	24($tptr,$j),$A0[1]
   1349 	lea	32($j),$j
   1350 	adc	\$0,$A0[0]
   1351 	mul	$m0			# n[7]*m0
   1352 	add	%rax,$A0[1]		# n[7]*m0+t[7]
   1353 	 mov	$Ni[0],%rax
   1354 	adc	%rdx,$A0[0]
   1355 	cmp	\$0,$j
   1356 	jne	.Lsqr4x_mont_inner
   1357 
   1358 	 sub	0(%rsp),$j		# $j=-$num	# modsched #
   1359 	 mov	$n0,$m0			#		# modsched #
   1360 
   1361 	xor	$A1[1],$A1[1]
   1362 	add	$A0[1],$A1[0]
   1363 	adc	\$0,$A1[1]
   1364 	mul	$m1			# n[6]*m1
   1365 	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
   1366 	mov	$Ni[1],%rax
   1367 	adc	%rdx,$A1[1]
   1368 	mov	$A1[0],-8($tptr)	# "t[7]"
   1369 
   1370 	xor	$A0[1],$A0[1]
   1371 	add	($tptr),$A0[0]		# +t[8]
   1372 	adc	\$0,$A0[1]
   1373 	 mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
   1374 	add	$topbit,$A0[0]
   1375 	adc	\$0,$A0[1]
   1376 
   1377 	 imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
   1378 	xor	$A1[0],$A1[0]
   1379 	 mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
   1380 	add	$A0[0],$A1[1]
   1381 	 mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
   1382 	adc	\$0,$A1[0]
   1383 	mul	$m1			# n[7]*m1
   1384 	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
   1385 	 mov	$Ni[0],%rax		#		# modsched #
   1386 	adc	%rdx,$A1[0]
   1387 	mov	$A1[1],($tptr)		# "t[8]"
   1388 
   1389 	xor	$topbit,$topbit
   1390 	add	8($tptr),$A1[0]		# +t[9]
   1391 	adc	$topbit,$topbit
   1392 	add	$A0[1],$A1[0]
   1393 	lea	16($tptr),$tptr		# "t[$num]>>128"
   1394 	adc	\$0,$topbit
   1395 	mov	$A1[0],-8($tptr)	# "t[9]"
   1396 	cmp	8(%rsp),$tptr		# are we done?
   1397 	jb	.Lsqr4x_mont_outer
   1398 
   1399 	mov	0(%rsp),$num		# restore $num
   1400 	mov	$topbit,($tptr)		# save $topbit
   1401 ___
   1402 }
   1404 ##############################################################
   1405 # Post-condition, 4x unrolled copy from bn_mul_mont
   1406 #
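# As in bn_mul_mont's tail: t[]-n[] is written to rp[] with borrow
# propagation, the final borrow (net of the topbit word) is turned into a
# 0/-1 style mask, and that mask selects branchlessly whether the copy
# loop publishes t[] or keeps the difference already stored in rp[]; the
# temporary vector on the stack is zapped in either case.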
   1407 {
   1408 my ($tptr,$nptr)=("%rbx",$aptr);
   1409 my @ri=("%rax","%rdx","%r10","%r11");
   1410 $code.=<<___;
   1411 	mov	64(%rsp,$num),@ri[0]	# tp[0]
   1412 	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
   1413 	mov	40(%rsp),$nptr		# restore $nptr
   1414 	shr	\$5,$num		# num/4
   1415 	mov	8($tptr),@ri[1]		# t[1]
   1416 	xor	$i,$i			# i=0 and clear CF!
   1417 
   1418 	mov	32(%rsp),$rptr		# restore $rptr
   1419 	sub	0($nptr),@ri[0]
   1420 	mov	16($tptr),@ri[2]	# t[2]
   1421 	mov	24($tptr),@ri[3]	# t[3]
   1422 	sbb	8($nptr),@ri[1]
   1423 	lea	-1($num),$j		# j=num/4-1
   1424 	jmp	.Lsqr4x_sub
   1425 .align	16
   1426 .Lsqr4x_sub:
   1427 	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1428 	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1429 	sbb	16($nptr,$i,8),@ri[2]
   1430 	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
   1431 	mov	40($tptr,$i,8),@ri[1]
   1432 	sbb	24($nptr,$i,8),@ri[3]
   1433 	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1434 	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1435 	sbb	32($nptr,$i,8),@ri[0]
   1436 	mov	48($tptr,$i,8),@ri[2]
   1437 	mov	56($tptr,$i,8),@ri[3]
   1438 	sbb	40($nptr,$i,8),@ri[1]
    1439 	lea	4($i),$i		# i+=4
   1440 	dec	$j			# doesn't affect CF!
   1441 	jnz	.Lsqr4x_sub
   1442 
   1443 	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1444 	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
   1445 	sbb	16($nptr,$i,8),@ri[2]
   1446 	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1447 	sbb	24($nptr,$i,8),@ri[3]
   1448 	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1449 
   1450 	sbb	\$0,@ri[0]		# handle upmost overflow bit
   1451 	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1452 	xor	$i,$i			# i=0
   1453 	and	@ri[0],$tptr
   1454 	not	@ri[0]
   1455 	mov	$rptr,$nptr
   1456 	and	@ri[0],$nptr
   1457 	lea	-1($num),$j
   1458 	or	$nptr,$tptr		# tp=borrow?tp:rp
   1459 
   1460 	pxor	%xmm0,%xmm0
   1461 	lea	64(%rsp,$num,8),$nptr
   1462 	movdqu	($tptr),%xmm1
   1463 	lea	($nptr,$num,8),$nptr
   1464 	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
   1465 	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
   1466 	movdqu	%xmm1,($rptr)
   1467 	jmp	.Lsqr4x_copy
   1468 .align	16
   1469 .Lsqr4x_copy:				# copy or in-place refresh
   1470 	movdqu	16($tptr,$i),%xmm2
   1471 	movdqu	32($tptr,$i),%xmm1
   1472 	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
   1473 	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
   1474 	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
   1475 	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
   1476 	movdqu	%xmm2,16($rptr,$i)
   1477 	movdqu	%xmm1,32($rptr,$i)
   1478 	lea	32($i),$i
   1479 	dec	$j
   1480 	jnz	.Lsqr4x_copy
   1481 
   1482 	movdqu	16($tptr,$i),%xmm2
   1483 	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
   1484 	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
   1485 	movdqu	%xmm2,16($rptr,$i)
   1486 ___
   1487 }
   1488 $code.=<<___;
   1489 	mov	56(%rsp),%rsi		# restore %rsp
   1490 	mov	\$1,%rax
   1491 	mov	0(%rsi),%r15
   1492 	mov	8(%rsi),%r14
   1493 	mov	16(%rsi),%r13
   1494 	mov	24(%rsi),%r12
   1495 	mov	32(%rsi),%rbp
   1496 	mov	40(%rsi),%rbx
   1497 	lea	48(%rsi),%rsp
   1498 .Lsqr4x_epilogue:
   1499 	ret
   1500 .size	bn_sqr4x_mont,.-bn_sqr4x_mont
   1501 ___
   1502 }}}
   1503 $code.=<<___;
   1504 .asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   1505 .align	16
   1506 ___
   1507 
   1508 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   1509 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
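# The handlers below run if an exception is raised while %rip is inside one
# of the routines above: they fetch the original %rsp that the prologue
# stashed in the allocated frame, restore the callee-saved registers from
# it into the CONTEXT record, and then let RtlVirtualUnwind continue the
# unwind past the variable-size stack frame.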
   1510 if ($win64) {
   1511 $rec="%rcx";
   1512 $frame="%rdx";
   1513 $context="%r8";
   1514 $disp="%r9";
   1515 
   1516 $code.=<<___;
   1517 .extern	__imp_RtlVirtualUnwind
   1518 .type	mul_handler,\@abi-omnipotent
   1519 .align	16
   1520 mul_handler:
   1521 	push	%rsi
   1522 	push	%rdi
   1523 	push	%rbx
   1524 	push	%rbp
   1525 	push	%r12
   1526 	push	%r13
   1527 	push	%r14
   1528 	push	%r15
   1529 	pushfq
   1530 	sub	\$64,%rsp
   1531 
   1532 	mov	120($context),%rax	# pull context->Rax
   1533 	mov	248($context),%rbx	# pull context->Rip
   1534 
   1535 	mov	8($disp),%rsi		# disp->ImageBase
   1536 	mov	56($disp),%r11		# disp->HandlerData
   1537 
   1538 	mov	0(%r11),%r10d		# HandlerData[0]
   1539 	lea	(%rsi,%r10),%r10	# end of prologue label
   1540 	cmp	%r10,%rbx		# context->Rip<end of prologue label
   1541 	jb	.Lcommon_seh_tail
   1542 
   1543 	mov	152($context),%rax	# pull context->Rsp
   1544 
   1545 	mov	4(%r11),%r10d		# HandlerData[1]
   1546 	lea	(%rsi,%r10),%r10	# epilogue label
   1547 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   1548 	jae	.Lcommon_seh_tail
   1549 
   1550 	mov	192($context),%r10	# pull $num
   1551 	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
   1552 	lea	48(%rax),%rax
   1553 
   1554 	mov	-8(%rax),%rbx
   1555 	mov	-16(%rax),%rbp
   1556 	mov	-24(%rax),%r12
   1557 	mov	-32(%rax),%r13
   1558 	mov	-40(%rax),%r14
   1559 	mov	-48(%rax),%r15
   1560 	mov	%rbx,144($context)	# restore context->Rbx
   1561 	mov	%rbp,160($context)	# restore context->Rbp
   1562 	mov	%r12,216($context)	# restore context->R12
   1563 	mov	%r13,224($context)	# restore context->R13
   1564 	mov	%r14,232($context)	# restore context->R14
   1565 	mov	%r15,240($context)	# restore context->R15
   1566 
   1567 	jmp	.Lcommon_seh_tail
   1568 .size	mul_handler,.-mul_handler
   1569 
   1570 .type	sqr_handler,\@abi-omnipotent
   1571 .align	16
   1572 sqr_handler:
   1573 	push	%rsi
   1574 	push	%rdi
   1575 	push	%rbx
   1576 	push	%rbp
   1577 	push	%r12
   1578 	push	%r13
   1579 	push	%r14
   1580 	push	%r15
   1581 	pushfq
   1582 	sub	\$64,%rsp
   1583 
   1584 	mov	120($context),%rax	# pull context->Rax
   1585 	mov	248($context),%rbx	# pull context->Rip
   1586 
   1587 	lea	.Lsqr4x_body(%rip),%r10
   1588 	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
   1589 	jb	.Lcommon_seh_tail
   1590 
   1591 	mov	152($context),%rax	# pull context->Rsp
   1592 
   1593 	lea	.Lsqr4x_epilogue(%rip),%r10
   1594 	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
   1595 	jae	.Lcommon_seh_tail
   1596 
   1597 	mov	56(%rax),%rax		# pull saved stack pointer
   1598 	lea	48(%rax),%rax
   1599 
   1600 	mov	-8(%rax),%rbx
   1601 	mov	-16(%rax),%rbp
   1602 	mov	-24(%rax),%r12
   1603 	mov	-32(%rax),%r13
   1604 	mov	-40(%rax),%r14
   1605 	mov	-48(%rax),%r15
   1606 	mov	%rbx,144($context)	# restore context->Rbx
   1607 	mov	%rbp,160($context)	# restore context->Rbp
   1608 	mov	%r12,216($context)	# restore context->R12
   1609 	mov	%r13,224($context)	# restore context->R13
   1610 	mov	%r14,232($context)	# restore context->R14
   1611 	mov	%r15,240($context)	# restore context->R15
   1612 
   1613 .Lcommon_seh_tail:
   1614 	mov	8(%rax),%rdi
   1615 	mov	16(%rax),%rsi
   1616 	mov	%rax,152($context)	# restore context->Rsp
   1617 	mov	%rsi,168($context)	# restore context->Rsi
   1618 	mov	%rdi,176($context)	# restore context->Rdi
   1619 
   1620 	mov	40($disp),%rdi		# disp->ContextRecord
   1621 	mov	$context,%rsi		# context
   1622 	mov	\$154,%ecx		# sizeof(CONTEXT)
   1623 	.long	0xa548f3fc		# cld; rep movsq
   1624 
   1625 	mov	$disp,%rsi
   1626 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   1627 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   1628 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   1629 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   1630 	mov	40(%rsi),%r10		# disp->ContextRecord
   1631 	lea	56(%rsi),%r11		# &disp->HandlerData
   1632 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   1633 	mov	%r10,32(%rsp)		# arg5
   1634 	mov	%r11,40(%rsp)		# arg6
   1635 	mov	%r12,48(%rsp)		# arg7
   1636 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   1637 	call	*__imp_RtlVirtualUnwind(%rip)
   1638 
   1639 	mov	\$1,%eax		# ExceptionContinueSearch
   1640 	add	\$64,%rsp
   1641 	popfq
   1642 	pop	%r15
   1643 	pop	%r14
   1644 	pop	%r13
   1645 	pop	%r12
   1646 	pop	%rbp
   1647 	pop	%rbx
   1648 	pop	%rdi
   1649 	pop	%rsi
   1650 	ret
   1651 .size	sqr_handler,.-sqr_handler
   1652 
   1653 .section	.pdata
   1654 .align	4
   1655 	.rva	.LSEH_begin_bn_mul_mont
   1656 	.rva	.LSEH_end_bn_mul_mont
   1657 	.rva	.LSEH_info_bn_mul_mont
   1658 
   1659 	.rva	.LSEH_begin_bn_mul4x_mont
   1660 	.rva	.LSEH_end_bn_mul4x_mont
   1661 	.rva	.LSEH_info_bn_mul4x_mont
   1662 
   1663 	.rva	.LSEH_begin_bn_sqr4x_mont
   1664 	.rva	.LSEH_end_bn_sqr4x_mont
   1665 	.rva	.LSEH_info_bn_sqr4x_mont
   1666 
   1667 .section	.xdata
   1668 .align	8
   1669 .LSEH_info_bn_mul_mont:
   1670 	.byte	9,0,0,0
   1671 	.rva	mul_handler
   1672 	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
   1673 .LSEH_info_bn_mul4x_mont:
   1674 	.byte	9,0,0,0
   1675 	.rva	mul_handler
   1676 	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
   1677 .LSEH_info_bn_sqr4x_mont:
   1678 	.byte	9,0,0,0
   1679 	.rva	sqr_handler
   1680 ___
   1681 }
   1682 
   1683 print $code;
   1684 close STDOUT;
   1685