#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives only a
# modest 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs
# more than twice as fast. The most common case, rsa1024 sign, is
# improved by a respectable 50%. It remains to be seen whether loop
# unrolling and a dedicated squaring routine can improve it further...

# July 2011.
#
# Add a dedicated squaring procedure. The performance improvement varies
# from platform to platform, but on average it is ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. The average performance improvement relative to
# the *initial* 2005 version of this module is ~0%/30%/40%/45% for
# 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output";

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";
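
# What follows computes rp[] = ap[]*bp[]*R^-1 mod np[] with R = 2^(64*num),
# where n0[0] is the usual Montgomery constant -np[0]^-1 mod 2^64; the
# routine always returns 1. Below is a rough reference sketch in C of the
# word-by-word algorithm, for orientation only: it assumes a 64-bit BN_ULONG
# and a 128-bit intermediate type, mirrors the two separate carry chains
# (the "hi0"/"hi1" registers above), and uses illustrative names that are
# not part of any OpenSSL API. The assembly special-cases the first outer
# iteration instead of zero-initializing tp[], and also zeroes the stack
# temporary during the final copy-out.
#
#	typedef unsigned long long u64;
#	typedef unsigned __int128  u128;
#
#	void mont_mul_ref(u64 *rp, const u64 *ap, const u64 *bp,
#			  const u64 *np, u64 n0, int num)
#	{
#		u64 tp[num+1], m, ca, cn, borrow = 0;
#		u128 t, tn;
#		int i, j;
#
#		memset(tp, 0, sizeof(tp));
#		for (i = 0; i < num; i++) {
#			t  = (u128)ap[0]*bp[i] + tp[0];
#			m  = (u64)t*n0;			/* mod 2^64       */
#			tn = (u128)np[0]*m + (u64)t;	/* low limb -> 0  */
#			ca = (u64)(t>>64);		/* ap-chain carry */
#			cn = (u64)(tn>>64);		/* np-chain carry */
#			for (j = 1; j < num; j++) {
#				t  = (u128)ap[j]*bp[i] + tp[j] + ca;
#				ca = (u64)(t>>64);
#				tn = (u128)np[j]*m + (u64)t + cn;
#				cn = (u64)(tn>>64);
#				tp[j-1] = (u64)tn;	/* shift down one limb  */
#			}
#			tn = (u128)tp[num] + ca + cn;	/* fold both carries    */
#			tp[num-1] = (u64)tn;
#			tp[num]   = (u64)(tn>>64);	/* overflow bit, 0 or 1 */
#		}
#		for (j = 0; j < num; j++) {		/* rp = tp - np         */
#			t = (u128)tp[j] - np[j] - borrow;
#			rp[j] = (u64)t;
#			borrow = (u64)(t>>64) & 1;
#		}
#		if (borrow > tp[num])			/* tp < np: keep tp     */
#			memcpy(rp, tp, num*sizeof(u64));
#	}
#
# The assembly performs that final selection branchlessly with a mask rather
# than with memcpy.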
     59 
     60 $code=<<___;
     61 .text
     62 
     63 .globl	bn_mul_mont
     64 .type	bn_mul_mont,\@function,6
     65 .align	16
     66 bn_mul_mont:
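	# Dispatch: lengths that are not a multiple of 4, or shorter than
	# 8 limbs, take the generic code below; everything else goes to the
	# 4x-unrolled bn_mul4x_mont, or to bn_sqr4x_mont when both input
	# pointers are the same (squaring).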
     67 	test	\$3,${num}d
     68 	jnz	.Lmul_enter
     69 	cmp	\$8,${num}d
     70 	jb	.Lmul_enter
     71 	cmp	$ap,$bp
     72 	jne	.Lmul4x_enter
     73 	jmp	.Lsqr4x_enter
     74 
     75 .align	16
     76 .Lmul_enter:
     77 	push	%rbx
     78 	push	%rbp
     79 	push	%r12
     80 	push	%r13
     81 	push	%r14
     82 	push	%r15
     83 
     84 	mov	${num}d,${num}d
     85 	lea	2($num),%r10
     86 	mov	%rsp,%r11
     87 	neg	%r10
     88 	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
     89 	and	\$-1024,%rsp		# minimize TLB usage
     90 
     91 	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
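	# The frame holds tp[0..num] plus one extra slot: the overflow word
	# lives in tp[num] and the caller's stack pointer in tp[num+1], where
	# both the epilogue and the Win64 SEH handler look for it.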
     92 .Lmul_body:
     93 	mov	$bp,%r12		# reassign $bp
     94 ___
     95 		$bp="%r12";
     96 $code.=<<___;
     97 	mov	($n0),$n0		# pull n0[0] value
     98 	mov	($bp),$m0		# m0=bp[0]
     99 	mov	($ap),%rax
    100 
    101 	xor	$i,$i			# i=0
    102 	xor	$j,$j			# j=0
    103 
    104 	mov	$n0,$m1
    105 	mulq	$m0			# ap[0]*bp[0]
    106 	mov	%rax,$lo0
    107 	mov	($np),%rax
    108 
    109 	imulq	$lo0,$m1		# "tp[0]"*n0
    110 	mov	%rdx,$hi0
    111 
    112 	mulq	$m1			# np[0]*m1
    113 	add	%rax,$lo0		# discarded
    114 	mov	8($ap),%rax
    115 	adc	\$0,%rdx
    116 	mov	%rdx,$hi1
    117 
    118 	lea	1($j),$j		# j++
    119 	jmp	.L1st_enter
    120 
    121 .align	16
    122 .L1st:
    123 	add	%rax,$hi1
    124 	mov	($ap,$j,8),%rax
    125 	adc	\$0,%rdx
    126 	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
    127 	mov	$lo0,$hi0
    128 	adc	\$0,%rdx
    129 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    130 	mov	%rdx,$hi1
    131 
    132 .L1st_enter:
    133 	mulq	$m0			# ap[j]*bp[0]
    134 	add	%rax,$hi0
    135 	mov	($np,$j,8),%rax
    136 	adc	\$0,%rdx
    137 	lea	1($j),$j		# j++
    138 	mov	%rdx,$lo0
    139 
    140 	mulq	$m1			# np[j]*m1
    141 	cmp	$num,$j
    142 	jne	.L1st
    143 
    144 	add	%rax,$hi1
    145 	mov	($ap),%rax		# ap[0]
    146 	adc	\$0,%rdx
    147 	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
    148 	adc	\$0,%rdx
    149 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    150 	mov	%rdx,$hi1
    151 	mov	$lo0,$hi0
    152 
    153 	xor	%rdx,%rdx
    154 	add	$hi0,$hi1
    155 	adc	\$0,%rdx
    156 	mov	$hi1,-8(%rsp,$num,8)
    157 	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
    158 
    159 	lea	1($i),$i		# i++
    160 	jmp	.Louter
    161 .align	16
    162 .Louter:
    163 	mov	($bp,$i,8),$m0		# m0=bp[i]
    164 	xor	$j,$j			# j=0
    165 	mov	$n0,$m1
    166 	mov	(%rsp),$lo0
    167 	mulq	$m0			# ap[0]*bp[i]
    168 	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
    169 	mov	($np),%rax
    170 	adc	\$0,%rdx
    171 
    172 	imulq	$lo0,$m1		# tp[0]*n0
    173 	mov	%rdx,$hi0
    174 
    175 	mulq	$m1			# np[0]*m1
    176 	add	%rax,$lo0		# discarded
    177 	mov	8($ap),%rax
    178 	adc	\$0,%rdx
    179 	mov	8(%rsp),$lo0		# tp[1]
    180 	mov	%rdx,$hi1
    181 
    182 	lea	1($j),$j		# j++
    183 	jmp	.Linner_enter
    184 
    185 .align	16
    186 .Linner:
    187 	add	%rax,$hi1
    188 	mov	($ap,$j,8),%rax
    189 	adc	\$0,%rdx
    190 	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
    191 	mov	(%rsp,$j,8),$lo0
    192 	adc	\$0,%rdx
    193 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    194 	mov	%rdx,$hi1
    195 
    196 .Linner_enter:
    197 	mulq	$m0			# ap[j]*bp[i]
    198 	add	%rax,$hi0
    199 	mov	($np,$j,8),%rax
    200 	adc	\$0,%rdx
    201 	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
    202 	mov	%rdx,$hi0
    203 	adc	\$0,$hi0
    204 	lea	1($j),$j		# j++
    205 
    206 	mulq	$m1			# np[j]*m1
    207 	cmp	$num,$j
    208 	jne	.Linner
    209 
    210 	add	%rax,$hi1
    211 	mov	($ap),%rax		# ap[0]
    212 	adc	\$0,%rdx
    213 	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
    214 	mov	(%rsp,$j,8),$lo0
    215 	adc	\$0,%rdx
    216 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    217 	mov	%rdx,$hi1
    218 
    219 	xor	%rdx,%rdx
    220 	add	$hi0,$hi1
    221 	adc	\$0,%rdx
    222 	add	$lo0,$hi1		# pull upmost overflow bit
    223 	adc	\$0,%rdx
    224 	mov	$hi1,-8(%rsp,$num,8)
    225 	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
    226 
    227 	lea	1($i),$i		# i++
    228 	cmp	$num,$i
    229 	jl	.Louter
    230 
    231 	xor	$i,$i			# i=0 and clear CF!
    232 	mov	(%rsp),%rax		# tp[0]
    233 	lea	(%rsp),$ap		# borrow ap for tp
    234 	mov	$num,$j			# j=num
    235 	jmp	.Lsub
    236 .align	16
    237 .Lsub:	sbb	($np,$i,8),%rax
    238 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
    239 	mov	8($ap,$i,8),%rax	# tp[i+1]
    240 	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
    242 	jnz	.Lsub
    243 
    244 	sbb	\$0,%rax		# handle upmost overflow bit
    245 	xor	$i,$i
    246 	and	%rax,$ap
    247 	not	%rax
    248 	mov	$rp,$np
    249 	and	%rax,$np
    250 	mov	$num,$j			# j=num
    251 	or	$np,$ap			# ap=borrow?tp:rp
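	# The .Lsub loop stored tp - np into rp and left a mask in rax:
	# zero when tp (counting its top word) >= np, all ones otherwise.
	# The and/not/or sequence above turns that into a branchless select,
	# roughly src = (mask & tp) | (~mask & rp), so the copy loop below
	# reads from tp if the subtraction underflowed and from rp otherwise,
	# zeroing the stack temporary as it goes.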
    252 .align	16
    253 .Lcopy:					# copy or in-place refresh
    254 	mov	($ap,$i,8),%rax
    255 	mov	$i,(%rsp,$i,8)		# zap temporary vector
    256 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
    257 	lea	1($i),$i
    258 	sub	\$1,$j
    259 	jnz	.Lcopy
    260 
    261 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
    262 	mov	\$1,%rax
    263 	mov	(%rsi),%r15
    264 	mov	8(%rsi),%r14
    265 	mov	16(%rsi),%r13
    266 	mov	24(%rsi),%r12
    267 	mov	32(%rsi),%rbp
    268 	mov	40(%rsi),%rbx
    269 	lea	48(%rsi),%rsp
    270 .Lmul_epilogue:
    271 	ret
    272 .size	bn_mul_mont,.-bn_mul_mont
    273 ___
    274 {{{
    275 my @A=("%r10","%r11");
    276 my @N=("%r13","%rdi");
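# bn_mul4x_mont runs the same algorithm with the inner loops unrolled 4x;
# @A are the two alternating accumulators of the ap[]*bp[i] product chain
# and @N those of the np[]*m1 reduction chain.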
    277 $code.=<<___;
    278 .type	bn_mul4x_mont,\@function,6
    279 .align	16
    280 bn_mul4x_mont:
    281 .Lmul4x_enter:
    282 	push	%rbx
    283 	push	%rbp
    284 	push	%r12
    285 	push	%r13
    286 	push	%r14
    287 	push	%r15
    288 
    289 	mov	${num}d,${num}d
    290 	lea	4($num),%r10
    291 	mov	%rsp,%r11
    292 	neg	%r10
    293 	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
    294 	and	\$-1024,%rsp		# minimize TLB usage
    295 
    296 	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
    297 .Lmul4x_body:
    298 	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
    299 	mov	%rdx,%r12		# reassign $bp
    300 ___
    301 		$bp="%r12";
    302 $code.=<<___;
    303 	mov	($n0),$n0		# pull n0[0] value
    304 	mov	($bp),$m0		# m0=bp[0]
    305 	mov	($ap),%rax
    306 
    307 	xor	$i,$i			# i=0
    308 	xor	$j,$j			# j=0
    309 
    310 	mov	$n0,$m1
    311 	mulq	$m0			# ap[0]*bp[0]
    312 	mov	%rax,$A[0]
    313 	mov	($np),%rax
    314 
    315 	imulq	$A[0],$m1		# "tp[0]"*n0
    316 	mov	%rdx,$A[1]
    317 
    318 	mulq	$m1			# np[0]*m1
    319 	add	%rax,$A[0]		# discarded
    320 	mov	8($ap),%rax
    321 	adc	\$0,%rdx
    322 	mov	%rdx,$N[1]
    323 
    324 	mulq	$m0
    325 	add	%rax,$A[1]
    326 	mov	8($np),%rax
    327 	adc	\$0,%rdx
    328 	mov	%rdx,$A[0]
    329 
    330 	mulq	$m1
    331 	add	%rax,$N[1]
    332 	mov	16($ap),%rax
    333 	adc	\$0,%rdx
    334 	add	$A[1],$N[1]
    335 	lea	4($j),$j		# j++
    336 	adc	\$0,%rdx
    337 	mov	$N[1],(%rsp)
    338 	mov	%rdx,$N[0]
    339 	jmp	.L1st4x
    340 .align	16
    341 .L1st4x:
    342 	mulq	$m0			# ap[j]*bp[0]
    343 	add	%rax,$A[0]
    344 	mov	-16($np,$j,8),%rax
    345 	adc	\$0,%rdx
    346 	mov	%rdx,$A[1]
    347 
    348 	mulq	$m1			# np[j]*m1
    349 	add	%rax,$N[0]
    350 	mov	-8($ap,$j,8),%rax
    351 	adc	\$0,%rdx
    352 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    353 	adc	\$0,%rdx
    354 	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    355 	mov	%rdx,$N[1]
    356 
    357 	mulq	$m0			# ap[j]*bp[0]
    358 	add	%rax,$A[1]
    359 	mov	-8($np,$j,8),%rax
    360 	adc	\$0,%rdx
    361 	mov	%rdx,$A[0]
    362 
    363 	mulq	$m1			# np[j]*m1
    364 	add	%rax,$N[1]
    365 	mov	($ap,$j,8),%rax
    366 	adc	\$0,%rdx
    367 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    368 	adc	\$0,%rdx
    369 	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    370 	mov	%rdx,$N[0]
    371 
    372 	mulq	$m0			# ap[j]*bp[0]
    373 	add	%rax,$A[0]
    374 	mov	($np,$j,8),%rax
    375 	adc	\$0,%rdx
    376 	mov	%rdx,$A[1]
    377 
    378 	mulq	$m1			# np[j]*m1
    379 	add	%rax,$N[0]
    380 	mov	8($ap,$j,8),%rax
    381 	adc	\$0,%rdx
    382 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    383 	adc	\$0,%rdx
    384 	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
    385 	mov	%rdx,$N[1]
    386 
    387 	mulq	$m0			# ap[j]*bp[0]
    388 	add	%rax,$A[1]
    389 	mov	8($np,$j,8),%rax
    390 	adc	\$0,%rdx
    391 	lea	4($j),$j		# j++
    392 	mov	%rdx,$A[0]
    393 
    394 	mulq	$m1			# np[j]*m1
    395 	add	%rax,$N[1]
    396 	mov	-16($ap,$j,8),%rax
    397 	adc	\$0,%rdx
    398 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    399 	adc	\$0,%rdx
    400 	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
    401 	mov	%rdx,$N[0]
    402 	cmp	$num,$j
    403 	jl	.L1st4x
    404 
    405 	mulq	$m0			# ap[j]*bp[0]
    406 	add	%rax,$A[0]
    407 	mov	-16($np,$j,8),%rax
    408 	adc	\$0,%rdx
    409 	mov	%rdx,$A[1]
    410 
    411 	mulq	$m1			# np[j]*m1
    412 	add	%rax,$N[0]
    413 	mov	-8($ap,$j,8),%rax
    414 	adc	\$0,%rdx
    415 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    416 	adc	\$0,%rdx
    417 	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    418 	mov	%rdx,$N[1]
    419 
    420 	mulq	$m0			# ap[j]*bp[0]
    421 	add	%rax,$A[1]
    422 	mov	-8($np,$j,8),%rax
    423 	adc	\$0,%rdx
    424 	mov	%rdx,$A[0]
    425 
    426 	mulq	$m1			# np[j]*m1
    427 	add	%rax,$N[1]
    428 	mov	($ap),%rax		# ap[0]
    429 	adc	\$0,%rdx
    430 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    431 	adc	\$0,%rdx
    432 	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    433 	mov	%rdx,$N[0]
    434 
    435 	xor	$N[1],$N[1]
    436 	add	$A[0],$N[0]
    437 	adc	\$0,$N[1]
    438 	mov	$N[0],-8(%rsp,$j,8)
    439 	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
    440 
    441 	lea	1($i),$i		# i++
    442 .align	4
    443 .Louter4x:
    444 	mov	($bp,$i,8),$m0		# m0=bp[i]
    445 	xor	$j,$j			# j=0
    446 	mov	(%rsp),$A[0]
    447 	mov	$n0,$m1
    448 	mulq	$m0			# ap[0]*bp[i]
    449 	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
    450 	mov	($np),%rax
    451 	adc	\$0,%rdx
    452 
    453 	imulq	$A[0],$m1		# tp[0]*n0
    454 	mov	%rdx,$A[1]
    455 
    456 	mulq	$m1			# np[0]*m1
    457 	add	%rax,$A[0]		# "$N[0]", discarded
    458 	mov	8($ap),%rax
    459 	adc	\$0,%rdx
    460 	mov	%rdx,$N[1]
    461 
    462 	mulq	$m0			# ap[j]*bp[i]
    463 	add	%rax,$A[1]
    464 	mov	8($np),%rax
    465 	adc	\$0,%rdx
    466 	add	8(%rsp),$A[1]		# +tp[1]
    467 	adc	\$0,%rdx
    468 	mov	%rdx,$A[0]
    469 
    470 	mulq	$m1			# np[j]*m1
    471 	add	%rax,$N[1]
    472 	mov	16($ap),%rax
    473 	adc	\$0,%rdx
    474 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
    475 	lea	4($j),$j		# j+=2
    476 	adc	\$0,%rdx
    477 	mov	$N[1],(%rsp)		# tp[j-1]
    478 	mov	%rdx,$N[0]
    479 	jmp	.Linner4x
    480 .align	16
    481 .Linner4x:
    482 	mulq	$m0			# ap[j]*bp[i]
    483 	add	%rax,$A[0]
    484 	mov	-16($np,$j,8),%rax
    485 	adc	\$0,%rdx
    486 	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
    487 	adc	\$0,%rdx
    488 	mov	%rdx,$A[1]
    489 
    490 	mulq	$m1			# np[j]*m1
    491 	add	%rax,$N[0]
    492 	mov	-8($ap,$j,8),%rax
    493 	adc	\$0,%rdx
    494 	add	$A[0],$N[0]
    495 	adc	\$0,%rdx
    496 	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    497 	mov	%rdx,$N[1]
    498 
    499 	mulq	$m0			# ap[j]*bp[i]
    500 	add	%rax,$A[1]
    501 	mov	-8($np,$j,8),%rax
    502 	adc	\$0,%rdx
    503 	add	-8(%rsp,$j,8),$A[1]
    504 	adc	\$0,%rdx
    505 	mov	%rdx,$A[0]
    506 
    507 	mulq	$m1			# np[j]*m1
    508 	add	%rax,$N[1]
    509 	mov	($ap,$j,8),%rax
    510 	adc	\$0,%rdx
    511 	add	$A[1],$N[1]
    512 	adc	\$0,%rdx
    513 	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    514 	mov	%rdx,$N[0]
    515 
    516 	mulq	$m0			# ap[j]*bp[i]
    517 	add	%rax,$A[0]
    518 	mov	($np,$j,8),%rax
    519 	adc	\$0,%rdx
    520 	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
    521 	adc	\$0,%rdx
    522 	mov	%rdx,$A[1]
    523 
    524 	mulq	$m1			# np[j]*m1
    525 	add	%rax,$N[0]
    526 	mov	8($ap,$j,8),%rax
    527 	adc	\$0,%rdx
    528 	add	$A[0],$N[0]
    529 	adc	\$0,%rdx
    530 	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
    531 	mov	%rdx,$N[1]
    532 
    533 	mulq	$m0			# ap[j]*bp[i]
    534 	add	%rax,$A[1]
    535 	mov	8($np,$j,8),%rax
    536 	adc	\$0,%rdx
    537 	add	8(%rsp,$j,8),$A[1]
    538 	adc	\$0,%rdx
    539 	lea	4($j),$j		# j++
    540 	mov	%rdx,$A[0]
    541 
    542 	mulq	$m1			# np[j]*m1
    543 	add	%rax,$N[1]
    544 	mov	-16($ap,$j,8),%rax
    545 	adc	\$0,%rdx
    546 	add	$A[1],$N[1]
    547 	adc	\$0,%rdx
    548 	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
    549 	mov	%rdx,$N[0]
    550 	cmp	$num,$j
    551 	jl	.Linner4x
    552 
    553 	mulq	$m0			# ap[j]*bp[i]
    554 	add	%rax,$A[0]
    555 	mov	-16($np,$j,8),%rax
    556 	adc	\$0,%rdx
    557 	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
    558 	adc	\$0,%rdx
    559 	mov	%rdx,$A[1]
    560 
    561 	mulq	$m1			# np[j]*m1
    562 	add	%rax,$N[0]
    563 	mov	-8($ap,$j,8),%rax
    564 	adc	\$0,%rdx
    565 	add	$A[0],$N[0]
    566 	adc	\$0,%rdx
    567 	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    568 	mov	%rdx,$N[1]
    569 
    570 	mulq	$m0			# ap[j]*bp[i]
    571 	add	%rax,$A[1]
    572 	mov	-8($np,$j,8),%rax
    573 	adc	\$0,%rdx
    574 	add	-8(%rsp,$j,8),$A[1]
    575 	adc	\$0,%rdx
    576 	lea	1($i),$i		# i++
    577 	mov	%rdx,$A[0]
    578 
    579 	mulq	$m1			# np[j]*m1
    580 	add	%rax,$N[1]
    581 	mov	($ap),%rax		# ap[0]
    582 	adc	\$0,%rdx
    583 	add	$A[1],$N[1]
    584 	adc	\$0,%rdx
    585 	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    586 	mov	%rdx,$N[0]
    587 
    588 	xor	$N[1],$N[1]
    589 	add	$A[0],$N[0]
    590 	adc	\$0,$N[1]
    591 	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
    592 	adc	\$0,$N[1]
    593 	mov	$N[0],-8(%rsp,$j,8)
    594 	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
    595 
    596 	cmp	$num,$i
    597 	jl	.Louter4x
    598 ___
    599 {
    600 my @ri=("%rax","%rdx",$m0,$m1);
    601 $code.=<<___;
    602 	mov	16(%rsp,$num,8),$rp	# restore $rp
    603 	mov	0(%rsp),@ri[0]		# tp[0]
    604 	pxor	%xmm0,%xmm0
    605 	mov	8(%rsp),@ri[1]		# tp[1]
    606 	shr	\$2,$num		# num/=4
    607 	lea	(%rsp),$ap		# borrow ap for tp
    608 	xor	$i,$i			# i=0 and clear CF!
    609 
    610 	sub	0($np),@ri[0]
    611 	mov	16($ap),@ri[2]		# tp[2]
    612 	mov	24($ap),@ri[3]		# tp[3]
    613 	sbb	8($np),@ri[1]
    614 	lea	-1($num),$j		# j=num/4-1
    615 	jmp	.Lsub4x
    616 .align	16
    617 .Lsub4x:
    618 	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
    619 	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
    620 	sbb	16($np,$i,8),@ri[2]
    621 	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
    622 	mov	40($ap,$i,8),@ri[1]
    623 	sbb	24($np,$i,8),@ri[3]
    624 	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
    625 	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
    626 	sbb	32($np,$i,8),@ri[0]
    627 	mov	48($ap,$i,8),@ri[2]
    628 	mov	56($ap,$i,8),@ri[3]
    629 	sbb	40($np,$i,8),@ri[1]
    630 	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
    632 	jnz	.Lsub4x
    633 
    634 	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
    635 	mov	32($ap,$i,8),@ri[0]	# load overflow bit
    636 	sbb	16($np,$i,8),@ri[2]
    637 	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
    638 	sbb	24($np,$i,8),@ri[3]
    639 	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
    640 
    641 	sbb	\$0,@ri[0]		# handle upmost overflow bit
    642 	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
    643 	xor	$i,$i			# i=0
    644 	and	@ri[0],$ap
    645 	not	@ri[0]
    646 	mov	$rp,$np
    647 	and	@ri[0],$np
    648 	lea	-1($num),$j
    649 	or	$np,$ap			# ap=borrow?tp:rp
    650 
    651 	movdqu	($ap),%xmm1
    652 	movdqa	%xmm0,(%rsp)
    653 	movdqu	%xmm1,($rp)
    654 	jmp	.Lcopy4x
    655 .align	16
    656 .Lcopy4x:					# copy or in-place refresh
    657 	movdqu	16($ap,$i),%xmm2
    658 	movdqu	32($ap,$i),%xmm1
    659 	movdqa	%xmm0,16(%rsp,$i)
    660 	movdqu	%xmm2,16($rp,$i)
    661 	movdqa	%xmm0,32(%rsp,$i)
    662 	movdqu	%xmm1,32($rp,$i)
    663 	lea	32($i),$i
    664 	dec	$j
    665 	jnz	.Lcopy4x
    666 
    667 	shl	\$2,$num
    668 	movdqu	16($ap,$i),%xmm2
    669 	movdqa	%xmm0,16(%rsp,$i)
    670 	movdqu	%xmm2,16($rp,$i)
    671 ___
    672 }
    673 $code.=<<___;
    674 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
    675 	mov	\$1,%rax
    676 	mov	(%rsi),%r15
    677 	mov	8(%rsi),%r14
    678 	mov	16(%rsi),%r13
    679 	mov	24(%rsi),%r12
    680 	mov	32(%rsi),%rbp
    681 	mov	40(%rsi),%rbx
    682 	lea	48(%rsi),%rsp
    683 .Lmul4x_epilogue:
    684 	ret
    685 .size	bn_mul4x_mont,.-bn_mul4x_mont
    686 ___
    687 }}}
    688 {{{
    690 ######################################################################
    691 # void bn_sqr4x_mont(
    692 my $rptr="%rdi";	# const BN_ULONG *rptr,
    693 my $aptr="%rsi";	# const BN_ULONG *aptr,
    694 my $bptr="%rdx";	# not used
    695 my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0,
my $num ="%r9";		# int num); has to be divisible by 4
			# and not less than 8
    699 
    700 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
    701 my @A0=("%r10","%r11");
    702 my @A1=("%r12","%r13");
    703 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
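# bn_sqr4x_mont computes the same Montgomery product as bn_mul_mont for the
# ap == bp case, i.e. rptr[] = aptr[]^2 * R^-1 mod nptr[] with R = 2^(64*num):
# it first forms the full 2*num-word square in a stack buffer t[] and then
# reduces it word by word.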
    704 
    705 $code.=<<___;
    706 .type	bn_sqr4x_mont,\@function,6
    707 .align	16
    708 bn_sqr4x_mont:
    709 .Lsqr4x_enter:
    710 	push	%rbx
    711 	push	%rbp
    712 	push	%r12
    713 	push	%r13
    714 	push	%r14
    715 	push	%r15
    716 
    717 	shl	\$3,${num}d		# convert $num to bytes
    718 	xor	%r10,%r10
    719 	mov	%rsp,%r11		# put aside %rsp
    720 	sub	$num,%r10		# -$num
    721 	mov	($n0),$n0		# *n0
    722 	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
    723 	and	\$-1024,%rsp		# minimize TLB usage
    724 	##############################################################
    725 	# Stack layout
    726 	#
    727 	# +0	saved $num, used in reduction section
    728 	# +8	&t[2*$num], used in reduction section
    729 	# +32	saved $rptr
    730 	# +40	saved $nptr
    731 	# +48	saved *n0
    732 	# +56	saved %rsp
    733 	# +64	t[2*$num]
    734 	#
    735 	mov	$rptr,32(%rsp)		# save $rptr
    736 	mov	$nptr,40(%rsp)
    737 	mov	$n0,  48(%rsp)
    738 	mov	%r11, 56(%rsp)		# save original %rsp
    739 .Lsqr4x_body:
    740 	##############################################################
    741 	# Squaring part:
    742 	#
    743 	# a) multiply-n-add everything but a[i]*a[i];
    744 	# b) shift result of a) by 1 to the left and accumulate
    745 	#    a[i]*a[i] products;
    746 	#
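	# This uses a^2 = sum_{i<j} 2*a[i]*a[j]*b^(i+j) + sum_i a[i]^2*b^(2*i)
	# with b = 2^64: the cross products are accumulated once into t[],
	# then t[] is doubled by a one-bit left shift while the squares
	# a[i]*a[i] are folded in (the .Lsqr4x_shift_n_add pass below).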
    747 	lea	32(%r10),$i		# $i=-($num-32)
    748 	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
    749 
    750 	mov	$num,$j			# $j=$num
    751 
    752 					# comments apply to $num==8 case
    753 	mov	-32($aptr,$i),$a0	# a[0]
    754 	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
    755 	mov	-24($aptr,$i),%rax	# a[1]
    756 	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
    757 	mov	-16($aptr,$i),$ai	# a[2]
    758 	mov	%rax,$a1
    759 
    760 	mul	$a0			# a[1]*a[0]
    761 	mov	%rax,$A0[0]		# a[1]*a[0]
    762 	 mov	$ai,%rax		# a[2]
    763 	mov	%rdx,$A0[1]
    764 	mov	$A0[0],-24($tptr,$i)	# t[1]
    765 
    766 	xor	$A0[0],$A0[0]
    767 	mul	$a0			# a[2]*a[0]
    768 	add	%rax,$A0[1]
    769 	 mov	$ai,%rax
    770 	adc	%rdx,$A0[0]
    771 	mov	$A0[1],-16($tptr,$i)	# t[2]
    772 
    773 	lea	-16($i),$j		# j=-16
    774 
    775 
    776 	 mov	8($aptr,$j),$ai		# a[3]
    777 	mul	$a1			# a[2]*a[1]
    778 	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
    779 	 mov	$ai,%rax
    780 	mov	%rdx,$A1[1]
    781 
    782 	xor	$A0[1],$A0[1]
    783 	add	$A1[0],$A0[0]
    784 	 lea	16($j),$j
    785 	adc	\$0,$A0[1]
    786 	mul	$a0			# a[3]*a[0]
    787 	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
    788 	 mov	$ai,%rax
    789 	adc	%rdx,$A0[1]
    790 	mov	$A0[0],-8($tptr,$j)	# t[3]
    791 	jmp	.Lsqr4x_1st
    792 
    793 .align	16
    794 .Lsqr4x_1st:
    795 	 mov	($aptr,$j),$ai		# a[4]
    796 	xor	$A1[0],$A1[0]
    797 	mul	$a1			# a[3]*a[1]
    798 	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
    799 	 mov	$ai,%rax
    800 	adc	%rdx,$A1[0]
    801 
    802 	xor	$A0[0],$A0[0]
    803 	add	$A1[1],$A0[1]
    804 	adc	\$0,$A0[0]
    805 	mul	$a0			# a[4]*a[0]
    806 	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
    807 	 mov	$ai,%rax		# a[3]
    808 	adc	%rdx,$A0[0]
    809 	mov	$A0[1],($tptr,$j)	# t[4]
    810 
    811 
    812 	 mov	8($aptr,$j),$ai		# a[5]
    813 	xor	$A1[1],$A1[1]
    814 	mul	$a1			# a[4]*a[3]
    815 	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
    816 	 mov	$ai,%rax
    817 	adc	%rdx,$A1[1]
    818 
    819 	xor	$A0[1],$A0[1]
    820 	add	$A1[0],$A0[0]
    821 	adc	\$0,$A0[1]
    822 	mul	$a0			# a[5]*a[2]
    823 	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
    824 	 mov	$ai,%rax
    825 	adc	%rdx,$A0[1]
    826 	mov	$A0[0],8($tptr,$j)	# t[5]
    827 
    828 	 mov	16($aptr,$j),$ai	# a[6]
    829 	xor	$A1[0],$A1[0]
    830 	mul	$a1			# a[5]*a[3]
    831 	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
    832 	 mov	$ai,%rax
    833 	adc	%rdx,$A1[0]
    834 
    835 	xor	$A0[0],$A0[0]
    836 	add	$A1[1],$A0[1]
    837 	adc	\$0,$A0[0]
    838 	mul	$a0			# a[6]*a[2]
    839 	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
    840 	 mov	$ai,%rax		# a[3]
    841 	adc	%rdx,$A0[0]
    842 	mov	$A0[1],16($tptr,$j)	# t[6]
    843 
    844 
    845 	 mov	24($aptr,$j),$ai	# a[7]
    846 	xor	$A1[1],$A1[1]
    847 	mul	$a1			# a[6]*a[5]
    848 	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
    849 	 mov	$ai,%rax
    850 	adc	%rdx,$A1[1]
    851 
    852 	xor	$A0[1],$A0[1]
    853 	add	$A1[0],$A0[0]
    854 	 lea	32($j),$j
    855 	adc	\$0,$A0[1]
    856 	mul	$a0			# a[7]*a[4]
    857 	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
    858 	 mov	$ai,%rax
    859 	adc	%rdx,$A0[1]
    860 	mov	$A0[0],-8($tptr,$j)	# t[7]
    861 
    862 	cmp	\$0,$j
    863 	jne	.Lsqr4x_1st
    864 
    865 	xor	$A1[0],$A1[0]
    866 	add	$A0[1],$A1[1]
    867 	adc	\$0,$A1[0]
    868 	mul	$a1			# a[7]*a[5]
    869 	add	%rax,$A1[1]
    870 	adc	%rdx,$A1[0]
    871 
    872 	mov	$A1[1],($tptr)		# t[8]
    873 	lea	16($i),$i
    874 	mov	$A1[0],8($tptr)		# t[9]
    875 	jmp	.Lsqr4x_outer
    876 
    877 .align	16
    878 .Lsqr4x_outer:				# comments apply to $num==6 case
    879 	mov	-32($aptr,$i),$a0	# a[0]
    880 	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
    881 	mov	-24($aptr,$i),%rax	# a[1]
    882 	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
    883 	mov	-16($aptr,$i),$ai	# a[2]
    884 	mov	%rax,$a1
    885 
    886 	mov	-24($tptr,$i),$A0[0]	# t[1]
    887 	xor	$A0[1],$A0[1]
    888 	mul	$a0			# a[1]*a[0]
    889 	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
    890 	 mov	$ai,%rax		# a[2]
    891 	adc	%rdx,$A0[1]
    892 	mov	$A0[0],-24($tptr,$i)	# t[1]
    893 
    894 	xor	$A0[0],$A0[0]
    895 	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
    896 	adc	\$0,$A0[0]
    897 	mul	$a0			# a[2]*a[0]
    898 	add	%rax,$A0[1]
    899 	 mov	$ai,%rax
    900 	adc	%rdx,$A0[0]
    901 	mov	$A0[1],-16($tptr,$i)	# t[2]
    902 
    903 	lea	-16($i),$j		# j=-16
    904 	xor	$A1[0],$A1[0]
    905 
    906 
    907 	 mov	8($aptr,$j),$ai		# a[3]
    908 	xor	$A1[1],$A1[1]
    909 	add	8($tptr,$j),$A1[0]
    910 	adc	\$0,$A1[1]
    911 	mul	$a1			# a[2]*a[1]
    912 	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
    913 	 mov	$ai,%rax
    914 	adc	%rdx,$A1[1]
    915 
    916 	xor	$A0[1],$A0[1]
    917 	add	$A1[0],$A0[0]
    918 	adc	\$0,$A0[1]
    919 	mul	$a0			# a[3]*a[0]
    920 	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
    921 	 mov	$ai,%rax
    922 	adc	%rdx,$A0[1]
    923 	mov	$A0[0],8($tptr,$j)	# t[3]
    924 
    925 	lea	16($j),$j
    926 	jmp	.Lsqr4x_inner
    927 
    928 .align	16
    929 .Lsqr4x_inner:
    930 	 mov	($aptr,$j),$ai		# a[4]
    931 	xor	$A1[0],$A1[0]
    932 	add	($tptr,$j),$A1[1]
    933 	adc	\$0,$A1[0]
    934 	mul	$a1			# a[3]*a[1]
    935 	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
    936 	 mov	$ai,%rax
    937 	adc	%rdx,$A1[0]
    938 
    939 	xor	$A0[0],$A0[0]
    940 	add	$A1[1],$A0[1]
    941 	adc	\$0,$A0[0]
    942 	mul	$a0			# a[4]*a[0]
    943 	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
    944 	 mov	$ai,%rax		# a[3]
    945 	adc	%rdx,$A0[0]
    946 	mov	$A0[1],($tptr,$j)	# t[4]
    947 
    948 	 mov	8($aptr,$j),$ai		# a[5]
    949 	xor	$A1[1],$A1[1]
    950 	add	8($tptr,$j),$A1[0]
    951 	adc	\$0,$A1[1]
    952 	mul	$a1			# a[4]*a[3]
    953 	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
    954 	 mov	$ai,%rax
    955 	adc	%rdx,$A1[1]
    956 
    957 	xor	$A0[1],$A0[1]
    958 	add	$A1[0],$A0[0]
    959 	lea	16($j),$j		# j++
    960 	adc	\$0,$A0[1]
    961 	mul	$a0			# a[5]*a[2]
    962 	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
    963 	 mov	$ai,%rax
    964 	adc	%rdx,$A0[1]
    965 	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
    966 
    967 	cmp	\$0,$j
    968 	jne	.Lsqr4x_inner
    969 
    970 	xor	$A1[0],$A1[0]
    971 	add	$A0[1],$A1[1]
    972 	adc	\$0,$A1[0]
    973 	mul	$a1			# a[5]*a[3]
    974 	add	%rax,$A1[1]
    975 	adc	%rdx,$A1[0]
    976 
    977 	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
    978 	mov	$A1[0],8($tptr)		# t[7], "preloaded t[3]" below
    979 
    980 	add	\$16,$i
    981 	jnz	.Lsqr4x_outer
    982 
    983 					# comments apply to $num==4 case
    984 	mov	-32($aptr),$a0		# a[0]
    985 	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
    986 	mov	-24($aptr),%rax		# a[1]
    987 	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
    988 	mov	-16($aptr),$ai		# a[2]
    989 	mov	%rax,$a1
    990 
    991 	xor	$A0[1],$A0[1]
    992 	mul	$a0			# a[1]*a[0]
    993 	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
    994 	 mov	$ai,%rax		# a[2]
    995 	adc	%rdx,$A0[1]
    996 	mov	$A0[0],-24($tptr)	# t[1]
    997 
    998 	xor	$A0[0],$A0[0]
    999 	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
   1000 	adc	\$0,$A0[0]
   1001 	mul	$a0			# a[2]*a[0]
   1002 	add	%rax,$A0[1]
   1003 	 mov	$ai,%rax
   1004 	adc	%rdx,$A0[0]
   1005 	mov	$A0[1],-16($tptr)	# t[2]
   1006 
   1007 	 mov	-8($aptr),$ai		# a[3]
   1008 	mul	$a1			# a[2]*a[1]
   1009 	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
   1010 	 mov	$ai,%rax
   1011 	adc	\$0,%rdx
   1012 
   1013 	xor	$A0[1],$A0[1]
   1014 	add	$A1[0],$A0[0]
   1015 	 mov	%rdx,$A1[1]
   1016 	adc	\$0,$A0[1]
   1017 	mul	$a0			# a[3]*a[0]
   1018 	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
   1019 	 mov	$ai,%rax
   1020 	adc	%rdx,$A0[1]
   1021 	mov	$A0[0],-8($tptr)	# t[3]
   1022 
   1023 	xor	$A1[0],$A1[0]
   1024 	add	$A0[1],$A1[1]
   1025 	adc	\$0,$A1[0]
   1026 	mul	$a1			# a[3]*a[1]
   1027 	add	%rax,$A1[1]
   1028 	 mov	-16($aptr),%rax		# a[2]
   1029 	adc	%rdx,$A1[0]
   1030 
   1031 	mov	$A1[1],($tptr)		# t[4]
   1032 	mov	$A1[0],8($tptr)		# t[5]
   1033 
   1034 	mul	$ai			# a[2]*a[3]
   1035 ___
   1036 {
   1037 my ($shift,$carry)=($a0,$a1);
   1038 my @S=(@A1,$ai,$n0);
   1039 $code.=<<___;
   1040 	 add	\$16,$i
   1041 	 xor	$shift,$shift
   1042 	 sub	$num,$i			# $i=16-$num
   1043 	 xor	$carry,$carry
   1044 
   1045 	add	$A1[0],%rax		# t[5]
   1046 	adc	\$0,%rdx
   1047 	mov	%rax,8($tptr)		# t[5]
   1048 	mov	%rdx,16($tptr)		# t[6]
   1049 	mov	$carry,24($tptr)	# t[7]
   1050 
   1051 	 mov	-16($aptr,$i),%rax	# a[0]
   1052 	lea	64(%rsp,$num,2),$tptr
   1053 	 xor	$A0[0],$A0[0]		# t[0]
   1054 	 mov	-24($tptr,$i,2),$A0[1]	# t[1]
   1055 
   1056 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1057 	shr	\$63,$A0[0]
   1058 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1059 	shr	\$63,$A0[1]
   1060 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1061 	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
   1062 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1063 	mul	%rax			# a[i]*a[i]
   1064 	neg	$carry			# mov $carry,cf
   1065 	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
   1066 	adc	%rax,$S[0]
   1067 	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
   1068 	mov	$S[0],-32($tptr,$i,2)
   1069 	adc	%rdx,$S[1]
   1070 
   1071 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
   1072 	 mov	$S[1],-24($tptr,$i,2)
   1073 	 sbb	$carry,$carry		# mov cf,$carry
   1074 	shr	\$63,$A0[0]
   1075 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1076 	shr	\$63,$A0[1]
   1077 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1078 	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
   1079 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1080 	mul	%rax			# a[i]*a[i]
   1081 	neg	$carry			# mov $carry,cf
   1082 	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
   1083 	adc	%rax,$S[2]
   1084 	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
   1085 	mov	$S[2],-16($tptr,$i,2)
   1086 	adc	%rdx,$S[3]
   1087 	lea	16($i),$i
   1088 	mov	$S[3],-40($tptr,$i,2)
   1089 	sbb	$carry,$carry		# mov cf,$carry
   1090 	jmp	.Lsqr4x_shift_n_add
   1091 
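	# Each step below picks up two adjacent t[] words, doubles them
	# (the bit shifted out of a word is carried across in "shift"),
	# adds the 128-bit square a[i]*a[i] together with the carry saved
	# in "carry", and stores the pair back; four such steps are
	# unrolled per loop iteration.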
   1092 .align	16
   1093 .Lsqr4x_shift_n_add:
   1094 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1095 	shr	\$63,$A0[0]
   1096 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1097 	shr	\$63,$A0[1]
   1098 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1099 	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
   1100 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1101 	mul	%rax			# a[i]*a[i]
   1102 	neg	$carry			# mov $carry,cf
   1103 	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
   1104 	adc	%rax,$S[0]
   1105 	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
   1106 	mov	$S[0],-32($tptr,$i,2)
   1107 	adc	%rdx,$S[1]
   1108 
   1109 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
   1110 	 mov	$S[1],-24($tptr,$i,2)
   1111 	 sbb	$carry,$carry		# mov cf,$carry
   1112 	shr	\$63,$A0[0]
   1113 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1114 	shr	\$63,$A0[1]
   1115 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1116 	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
   1117 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1118 	mul	%rax			# a[i]*a[i]
   1119 	neg	$carry			# mov $carry,cf
   1120 	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
   1121 	adc	%rax,$S[2]
   1122 	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
   1123 	mov	$S[2],-16($tptr,$i,2)
   1124 	adc	%rdx,$S[3]
   1125 
   1126 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1127 	 mov	$S[3],-8($tptr,$i,2)
   1128 	 sbb	$carry,$carry		# mov cf,$carry
   1129 	shr	\$63,$A0[0]
   1130 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1131 	shr	\$63,$A0[1]
   1132 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1133 	 mov	16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
   1134 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1135 	mul	%rax			# a[i]*a[i]
   1136 	neg	$carry			# mov $carry,cf
   1137 	 mov	24($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
   1138 	adc	%rax,$S[0]
   1139 	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
   1140 	mov	$S[0],0($tptr,$i,2)
   1141 	adc	%rdx,$S[1]
   1142 
   1143 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
   1144 	 mov	$S[1],8($tptr,$i,2)
   1145 	 sbb	$carry,$carry		# mov cf,$carry
   1146 	shr	\$63,$A0[0]
   1147 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1148 	shr	\$63,$A0[1]
   1149 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1150 	 mov	32($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
   1151 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1152 	mul	%rax			# a[i]*a[i]
   1153 	neg	$carry			# mov $carry,cf
   1154 	 mov	40($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
   1155 	adc	%rax,$S[2]
   1156 	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
   1157 	mov	$S[2],16($tptr,$i,2)
   1158 	adc	%rdx,$S[3]
   1159 	mov	$S[3],24($tptr,$i,2)
   1160 	sbb	$carry,$carry		# mov cf,$carry
   1161 	add	\$32,$i
   1162 	jnz	.Lsqr4x_shift_n_add
   1163 
   1164 	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
   1165 	shr	\$63,$A0[0]
   1166 	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
   1167 	shr	\$63,$A0[1]
   1168 	or	$A0[0],$S[1]		# | t[2*i]>>63
   1169 	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
   1170 	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
   1171 	mul	%rax			# a[i]*a[i]
   1172 	neg	$carry			# mov $carry,cf
   1173 	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
   1174 	adc	%rax,$S[0]
   1175 	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
   1176 	mov	$S[0],-32($tptr)
   1177 	adc	%rdx,$S[1]
   1178 
   1179 	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
   1180 	 mov	$S[1],-24($tptr)
   1181 	 sbb	$carry,$carry		# mov cf,$carry
   1182 	shr	\$63,$A0[0]
   1183 	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
   1184 	shr	\$63,$A0[1]
   1185 	or	$A0[0],$S[3]		# | t[2*i]>>63
   1186 	mul	%rax			# a[i]*a[i]
   1187 	neg	$carry			# mov $carry,cf
   1188 	adc	%rax,$S[2]
   1189 	adc	%rdx,$S[3]
   1190 	mov	$S[2],-16($tptr)
   1191 	mov	$S[3],-8($tptr)
   1192 ___
   1193 }
   1195 ##############################################################
   1196 # Montgomery reduction part, "word-by-word" algorithm.
   1197 #
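# A rough sketch of this reduction in C, for orientation only (64-bit limbs,
# 128-bit intermediates, illustrative names): each pass zeroes one low limb
# of the double-width result t[], and the assembly below interleaves two such
# passes (m0 and m1) per outer iteration.
#
#	for (i = 0; i < num; i++) {
#		m = t[i]*n0;			/* mod 2^64 */
#		c = 0;
#		for (j = 0; j < num; j++) {
#			v = (u128)m*n[j] + t[i+j] + c;
#			t[i+j] = (u64)v;
#			c      = (u64)(v>>64);
#		}
#		/* propagate c into t[i+num], ..., keeping the top bit */
#	}
#	/* result is t[num..2*num-1] (+ top bit), minus n[] if >= n[] */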
   1198 {
   1199 my ($topbit,$nptr)=("%rbp",$aptr);
   1200 my ($m0,$m1)=($a0,$a1);
   1201 my @Ni=("%rbx","%r9");
   1202 $code.=<<___;
   1203 	mov	40(%rsp),$nptr		# restore $nptr
   1204 	mov	48(%rsp),$n0		# restore *n0
   1205 	xor	$j,$j
   1206 	mov	$num,0(%rsp)		# save $num
   1207 	sub	$num,$j			# $j=-$num
   1208 	 mov	64(%rsp),$A0[0]		# t[0]		# modsched #
   1209 	 mov	$n0,$m0			#		# modsched #
   1210 	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
   1211 	lea	64(%rsp,$num),$tptr	# end of t[] window
   1212 	mov	%rax,8(%rsp)		# save end of t[] buffer
   1213 	lea	($nptr,$num),$nptr	# end of n[] buffer
   1214 	xor	$topbit,$topbit		# $topbit=0
   1215 
   1216 	mov	0($nptr,$j),%rax	# n[0]		# modsched #
   1217 	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
   1218 	 imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
   1219 	 mov	%rax,$Ni[0]		#		# modsched #
   1220 	jmp	.Lsqr4x_mont_outer
   1221 
   1222 .align	16
   1223 .Lsqr4x_mont_outer:
   1224 	xor	$A0[1],$A0[1]
   1225 	mul	$m0			# n[0]*m0
   1226 	add	%rax,$A0[0]		# n[0]*m0+t[0]
   1227 	 mov	$Ni[1],%rax
   1228 	adc	%rdx,$A0[1]
   1229 	mov	$n0,$m1
   1230 
   1231 	xor	$A0[0],$A0[0]
   1232 	add	8($tptr,$j),$A0[1]
   1233 	adc	\$0,$A0[0]
   1234 	mul	$m0			# n[1]*m0
   1235 	add	%rax,$A0[1]		# n[1]*m0+t[1]
   1236 	 mov	$Ni[0],%rax
   1237 	adc	%rdx,$A0[0]
   1238 
   1239 	imulq	$A0[1],$m1
   1240 
   1241 	mov	16($nptr,$j),$Ni[0]	# n[2]
   1242 	xor	$A1[1],$A1[1]
   1243 	add	$A0[1],$A1[0]
   1244 	adc	\$0,$A1[1]
   1245 	mul	$m1			# n[0]*m1
   1246 	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
   1247 	 mov	$Ni[0],%rax
   1248 	adc	%rdx,$A1[1]
   1249 	mov	$A1[0],8($tptr,$j)	# "t[1]"
   1250 
   1251 	xor	$A0[1],$A0[1]
   1252 	add	16($tptr,$j),$A0[0]
   1253 	adc	\$0,$A0[1]
   1254 	mul	$m0			# n[2]*m0
   1255 	add	%rax,$A0[0]		# n[2]*m0+t[2]
   1256 	 mov	$Ni[1],%rax
   1257 	adc	%rdx,$A0[1]
   1258 
   1259 	mov	24($nptr,$j),$Ni[1]	# n[3]
   1260 	xor	$A1[0],$A1[0]
   1261 	add	$A0[0],$A1[1]
   1262 	adc	\$0,$A1[0]
   1263 	mul	$m1			# n[1]*m1
   1264 	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
   1265 	 mov	$Ni[1],%rax
   1266 	adc	%rdx,$A1[0]
   1267 	mov	$A1[1],16($tptr,$j)	# "t[2]"
   1268 
   1269 	xor	$A0[0],$A0[0]
   1270 	add	24($tptr,$j),$A0[1]
   1271 	lea	32($j),$j
   1272 	adc	\$0,$A0[0]
   1273 	mul	$m0			# n[3]*m0
   1274 	add	%rax,$A0[1]		# n[3]*m0+t[3]
   1275 	 mov	$Ni[0],%rax
   1276 	adc	%rdx,$A0[0]
   1277 	jmp	.Lsqr4x_mont_inner
   1278 
   1279 .align	16
   1280 .Lsqr4x_mont_inner:
   1281 	mov	($nptr,$j),$Ni[0]	# n[4]
   1282 	xor	$A1[1],$A1[1]
   1283 	add	$A0[1],$A1[0]
   1284 	adc	\$0,$A1[1]
   1285 	mul	$m1			# n[2]*m1
   1286 	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
   1287 	 mov	$Ni[0],%rax
   1288 	adc	%rdx,$A1[1]
   1289 	mov	$A1[0],-8($tptr,$j)	# "t[3]"
   1290 
   1291 	xor	$A0[1],$A0[1]
   1292 	add	($tptr,$j),$A0[0]
   1293 	adc	\$0,$A0[1]
   1294 	mul	$m0			# n[4]*m0
   1295 	add	%rax,$A0[0]		# n[4]*m0+t[4]
   1296 	 mov	$Ni[1],%rax
   1297 	adc	%rdx,$A0[1]
   1298 
   1299 	mov	8($nptr,$j),$Ni[1]	# n[5]
   1300 	xor	$A1[0],$A1[0]
   1301 	add	$A0[0],$A1[1]
   1302 	adc	\$0,$A1[0]
   1303 	mul	$m1			# n[3]*m1
   1304 	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
   1305 	 mov	$Ni[1],%rax
   1306 	adc	%rdx,$A1[0]
   1307 	mov	$A1[1],($tptr,$j)	# "t[4]"
   1308 
   1309 	xor	$A0[0],$A0[0]
   1310 	add	8($tptr,$j),$A0[1]
   1311 	adc	\$0,$A0[0]
   1312 	mul	$m0			# n[5]*m0
   1313 	add	%rax,$A0[1]		# n[5]*m0+t[5]
   1314 	 mov	$Ni[0],%rax
   1315 	adc	%rdx,$A0[0]
   1316 
   1317 
   1318 	mov	16($nptr,$j),$Ni[0]	# n[6]
   1319 	xor	$A1[1],$A1[1]
   1320 	add	$A0[1],$A1[0]
   1321 	adc	\$0,$A1[1]
   1322 	mul	$m1			# n[4]*m1
   1323 	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
   1324 	 mov	$Ni[0],%rax
   1325 	adc	%rdx,$A1[1]
   1326 	mov	$A1[0],8($tptr,$j)	# "t[5]"
   1327 
   1328 	xor	$A0[1],$A0[1]
   1329 	add	16($tptr,$j),$A0[0]
   1330 	adc	\$0,$A0[1]
   1331 	mul	$m0			# n[6]*m0
   1332 	add	%rax,$A0[0]		# n[6]*m0+t[6]
   1333 	 mov	$Ni[1],%rax
   1334 	adc	%rdx,$A0[1]
   1335 
   1336 	mov	24($nptr,$j),$Ni[1]	# n[7]
   1337 	xor	$A1[0],$A1[0]
   1338 	add	$A0[0],$A1[1]
   1339 	adc	\$0,$A1[0]
   1340 	mul	$m1			# n[5]*m1
   1341 	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
   1342 	 mov	$Ni[1],%rax
   1343 	adc	%rdx,$A1[0]
   1344 	mov	$A1[1],16($tptr,$j)	# "t[6]"
   1345 
   1346 	xor	$A0[0],$A0[0]
   1347 	add	24($tptr,$j),$A0[1]
   1348 	lea	32($j),$j
   1349 	adc	\$0,$A0[0]
   1350 	mul	$m0			# n[7]*m0
   1351 	add	%rax,$A0[1]		# n[7]*m0+t[7]
   1352 	 mov	$Ni[0],%rax
   1353 	adc	%rdx,$A0[0]
   1354 	cmp	\$0,$j
   1355 	jne	.Lsqr4x_mont_inner
   1356 
   1357 	 sub	0(%rsp),$j		# $j=-$num	# modsched #
   1358 	 mov	$n0,$m0			#		# modsched #
   1359 
   1360 	xor	$A1[1],$A1[1]
   1361 	add	$A0[1],$A1[0]
   1362 	adc	\$0,$A1[1]
   1363 	mul	$m1			# n[6]*m1
   1364 	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
   1365 	mov	$Ni[1],%rax
   1366 	adc	%rdx,$A1[1]
   1367 	mov	$A1[0],-8($tptr)	# "t[7]"
   1368 
   1369 	xor	$A0[1],$A0[1]
   1370 	add	($tptr),$A0[0]		# +t[8]
   1371 	adc	\$0,$A0[1]
   1372 	 mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
   1373 	add	$topbit,$A0[0]
   1374 	adc	\$0,$A0[1]
   1375 
   1376 	 imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
   1377 	xor	$A1[0],$A1[0]
   1378 	 mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
   1379 	add	$A0[0],$A1[1]
   1380 	 mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
   1381 	adc	\$0,$A1[0]
   1382 	mul	$m1			# n[7]*m1
   1383 	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
   1384 	 mov	$Ni[0],%rax		#		# modsched #
   1385 	adc	%rdx,$A1[0]
   1386 	mov	$A1[1],($tptr)		# "t[8]"
   1387 
   1388 	xor	$topbit,$topbit
   1389 	add	8($tptr),$A1[0]		# +t[9]
   1390 	adc	$topbit,$topbit
   1391 	add	$A0[1],$A1[0]
   1392 	lea	16($tptr),$tptr		# "t[$num]>>128"
   1393 	adc	\$0,$topbit
   1394 	mov	$A1[0],-8($tptr)	# "t[9]"
   1395 	cmp	8(%rsp),$tptr		# are we done?
   1396 	jb	.Lsqr4x_mont_outer
   1397 
   1398 	mov	0(%rsp),$num		# restore $num
   1399 	mov	$topbit,($tptr)		# save $topbit
   1400 ___
   1401 }
   1403 ##############################################################
   1404 # Post-condition, 4x unrolled copy from bn_mul_mont
   1405 #
   1406 {
   1407 my ($tptr,$nptr)=("%rbx",$aptr);
   1408 my @ri=("%rax","%rdx","%r10","%r11");
   1409 $code.=<<___;
   1410 	mov	64(%rsp,$num),@ri[0]	# tp[0]
   1411 	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
   1412 	mov	40(%rsp),$nptr		# restore $nptr
   1413 	shr	\$5,$num		# num/4
   1414 	mov	8($tptr),@ri[1]		# t[1]
   1415 	xor	$i,$i			# i=0 and clear CF!
   1416 
   1417 	mov	32(%rsp),$rptr		# restore $rptr
   1418 	sub	0($nptr),@ri[0]
   1419 	mov	16($tptr),@ri[2]	# t[2]
   1420 	mov	24($tptr),@ri[3]	# t[3]
   1421 	sbb	8($nptr),@ri[1]
   1422 	lea	-1($num),$j		# j=num/4-1
   1423 	jmp	.Lsqr4x_sub
   1424 .align	16
   1425 .Lsqr4x_sub:
   1426 	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1427 	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1428 	sbb	16($nptr,$i,8),@ri[2]
   1429 	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
   1430 	mov	40($tptr,$i,8),@ri[1]
   1431 	sbb	24($nptr,$i,8),@ri[3]
   1432 	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1433 	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1434 	sbb	32($nptr,$i,8),@ri[0]
   1435 	mov	48($tptr,$i,8),@ri[2]
   1436 	mov	56($tptr,$i,8),@ri[3]
   1437 	sbb	40($nptr,$i,8),@ri[1]
   1438 	lea	4($i),$i		# i++
   1439 	dec	$j			# doesn't affect CF!
   1440 	jnz	.Lsqr4x_sub
   1441 
   1442 	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1443 	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
   1444 	sbb	16($nptr,$i,8),@ri[2]
   1445 	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1446 	sbb	24($nptr,$i,8),@ri[3]
   1447 	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1448 
   1449 	sbb	\$0,@ri[0]		# handle upmost overflow bit
   1450 	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
   1451 	xor	$i,$i			# i=0
   1452 	and	@ri[0],$tptr
   1453 	not	@ri[0]
   1454 	mov	$rptr,$nptr
   1455 	and	@ri[0],$nptr
   1456 	lea	-1($num),$j
   1457 	or	$nptr,$tptr		# tp=borrow?tp:rp
   1458 
   1459 	pxor	%xmm0,%xmm0
   1460 	lea	64(%rsp,$num,8),$nptr
   1461 	movdqu	($tptr),%xmm1
   1462 	lea	($nptr,$num,8),$nptr
   1463 	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
   1464 	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
   1465 	movdqu	%xmm1,($rptr)
   1466 	jmp	.Lsqr4x_copy
   1467 .align	16
   1468 .Lsqr4x_copy:				# copy or in-place refresh
   1469 	movdqu	16($tptr,$i),%xmm2
   1470 	movdqu	32($tptr,$i),%xmm1
   1471 	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
   1472 	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
   1473 	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
   1474 	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
   1475 	movdqu	%xmm2,16($rptr,$i)
   1476 	movdqu	%xmm1,32($rptr,$i)
   1477 	lea	32($i),$i
   1478 	dec	$j
   1479 	jnz	.Lsqr4x_copy
   1480 
   1481 	movdqu	16($tptr,$i),%xmm2
   1482 	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
   1483 	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
   1484 	movdqu	%xmm2,16($rptr,$i)
   1485 ___
   1486 }
   1487 $code.=<<___;
   1488 	mov	56(%rsp),%rsi		# restore %rsp
   1489 	mov	\$1,%rax
   1490 	mov	0(%rsi),%r15
   1491 	mov	8(%rsi),%r14
   1492 	mov	16(%rsi),%r13
   1493 	mov	24(%rsi),%r12
   1494 	mov	32(%rsi),%rbp
   1495 	mov	40(%rsi),%rbx
   1496 	lea	48(%rsi),%rsp
   1497 .Lsqr4x_epilogue:
   1498 	ret
   1499 .size	bn_sqr4x_mont,.-bn_sqr4x_mont
   1500 ___
   1501 }}}
   1502 $code.=<<___;
   1503 .asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   1504 .align	16
   1505 ___
   1506 
   1507 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   1508 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
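#
# The two handlers below recover the stack pointer that the prologue stashed
# above the tp[] area (mul_handler reads it through the num argument saved in
# the CONTEXT, sqr_handler from the fixed 56(%rsp) slot), copy the saved
# non-volatile registers into the CONTEXT record and hand the rest of the
# unwind to RtlVirtualUnwind.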
   1509 if ($win64) {
   1510 $rec="%rcx";
   1511 $frame="%rdx";
   1512 $context="%r8";
   1513 $disp="%r9";
   1514 
   1515 $code.=<<___;
   1516 .extern	__imp_RtlVirtualUnwind
   1517 .type	mul_handler,\@abi-omnipotent
   1518 .align	16
   1519 mul_handler:
   1520 	push	%rsi
   1521 	push	%rdi
   1522 	push	%rbx
   1523 	push	%rbp
   1524 	push	%r12
   1525 	push	%r13
   1526 	push	%r14
   1527 	push	%r15
   1528 	pushfq
   1529 	sub	\$64,%rsp
   1530 
   1531 	mov	120($context),%rax	# pull context->Rax
   1532 	mov	248($context),%rbx	# pull context->Rip
   1533 
   1534 	mov	8($disp),%rsi		# disp->ImageBase
   1535 	mov	56($disp),%r11		# disp->HandlerData
   1536 
   1537 	mov	0(%r11),%r10d		# HandlerData[0]
   1538 	lea	(%rsi,%r10),%r10	# end of prologue label
   1539 	cmp	%r10,%rbx		# context->Rip<end of prologue label
   1540 	jb	.Lcommon_seh_tail
   1541 
   1542 	mov	152($context),%rax	# pull context->Rsp
   1543 
   1544 	mov	4(%r11),%r10d		# HandlerData[1]
   1545 	lea	(%rsi,%r10),%r10	# epilogue label
   1546 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   1547 	jae	.Lcommon_seh_tail
   1548 
   1549 	mov	192($context),%r10	# pull $num
   1550 	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
   1551 	lea	48(%rax),%rax
   1552 
   1553 	mov	-8(%rax),%rbx
   1554 	mov	-16(%rax),%rbp
   1555 	mov	-24(%rax),%r12
   1556 	mov	-32(%rax),%r13
   1557 	mov	-40(%rax),%r14
   1558 	mov	-48(%rax),%r15
   1559 	mov	%rbx,144($context)	# restore context->Rbx
   1560 	mov	%rbp,160($context)	# restore context->Rbp
   1561 	mov	%r12,216($context)	# restore context->R12
   1562 	mov	%r13,224($context)	# restore context->R13
   1563 	mov	%r14,232($context)	# restore context->R14
   1564 	mov	%r15,240($context)	# restore context->R15
   1565 
   1566 	jmp	.Lcommon_seh_tail
   1567 .size	mul_handler,.-mul_handler
   1568 
   1569 .type	sqr_handler,\@abi-omnipotent
   1570 .align	16
   1571 sqr_handler:
   1572 	push	%rsi
   1573 	push	%rdi
   1574 	push	%rbx
   1575 	push	%rbp
   1576 	push	%r12
   1577 	push	%r13
   1578 	push	%r14
   1579 	push	%r15
   1580 	pushfq
   1581 	sub	\$64,%rsp
   1582 
   1583 	mov	120($context),%rax	# pull context->Rax
   1584 	mov	248($context),%rbx	# pull context->Rip
   1585 
   1586 	lea	.Lsqr4x_body(%rip),%r10
   1587 	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
   1588 	jb	.Lcommon_seh_tail
   1589 
   1590 	mov	152($context),%rax	# pull context->Rsp
   1591 
   1592 	lea	.Lsqr4x_epilogue(%rip),%r10
   1593 	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
   1594 	jae	.Lcommon_seh_tail
   1595 
   1596 	mov	56(%rax),%rax		# pull saved stack pointer
   1597 	lea	48(%rax),%rax
   1598 
   1599 	mov	-8(%rax),%rbx
   1600 	mov	-16(%rax),%rbp
   1601 	mov	-24(%rax),%r12
   1602 	mov	-32(%rax),%r13
   1603 	mov	-40(%rax),%r14
   1604 	mov	-48(%rax),%r15
   1605 	mov	%rbx,144($context)	# restore context->Rbx
   1606 	mov	%rbp,160($context)	# restore context->Rbp
   1607 	mov	%r12,216($context)	# restore context->R12
   1608 	mov	%r13,224($context)	# restore context->R13
   1609 	mov	%r14,232($context)	# restore context->R14
   1610 	mov	%r15,240($context)	# restore context->R15
   1611 
   1612 .Lcommon_seh_tail:
   1613 	mov	8(%rax),%rdi
   1614 	mov	16(%rax),%rsi
   1615 	mov	%rax,152($context)	# restore context->Rsp
   1616 	mov	%rsi,168($context)	# restore context->Rsi
   1617 	mov	%rdi,176($context)	# restore context->Rdi
   1618 
   1619 	mov	40($disp),%rdi		# disp->ContextRecord
   1620 	mov	$context,%rsi		# context
   1621 	mov	\$154,%ecx		# sizeof(CONTEXT)
   1622 	.long	0xa548f3fc		# cld; rep movsq
   1623 
   1624 	mov	$disp,%rsi
   1625 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   1626 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   1627 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   1628 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   1629 	mov	40(%rsi),%r10		# disp->ContextRecord
   1630 	lea	56(%rsi),%r11		# &disp->HandlerData
   1631 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   1632 	mov	%r10,32(%rsp)		# arg5
   1633 	mov	%r11,40(%rsp)		# arg6
   1634 	mov	%r12,48(%rsp)		# arg7
   1635 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   1636 	call	*__imp_RtlVirtualUnwind(%rip)
   1637 
   1638 	mov	\$1,%eax		# ExceptionContinueSearch
   1639 	add	\$64,%rsp
   1640 	popfq
   1641 	pop	%r15
   1642 	pop	%r14
   1643 	pop	%r13
   1644 	pop	%r12
   1645 	pop	%rbp
   1646 	pop	%rbx
   1647 	pop	%rdi
   1648 	pop	%rsi
   1649 	ret
   1650 .size	sqr_handler,.-sqr_handler
   1651 
   1652 .section	.pdata
   1653 .align	4
   1654 	.rva	.LSEH_begin_bn_mul_mont
   1655 	.rva	.LSEH_end_bn_mul_mont
   1656 	.rva	.LSEH_info_bn_mul_mont
   1657 
   1658 	.rva	.LSEH_begin_bn_mul4x_mont
   1659 	.rva	.LSEH_end_bn_mul4x_mont
   1660 	.rva	.LSEH_info_bn_mul4x_mont
   1661 
   1662 	.rva	.LSEH_begin_bn_sqr4x_mont
   1663 	.rva	.LSEH_end_bn_sqr4x_mont
   1664 	.rva	.LSEH_info_bn_sqr4x_mont
   1665 
   1666 .section	.xdata
   1667 .align	8
   1668 .LSEH_info_bn_mul_mont:
   1669 	.byte	9,0,0,0
   1670 	.rva	mul_handler
   1671 	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
   1672 .LSEH_info_bn_mul4x_mont:
   1673 	.byte	9,0,0,0
   1674 	.rva	mul_handler
   1675 	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
   1676 .LSEH_info_bn_sqr4x_mont:
   1677 	.byte	9,0,0,0
   1678 	.rva	sqr_handler
   1679 ___
   1680 }
   1681 
   1682 print $code;
   1683 close STDOUT;
   1684