Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # August 2011.
     11 #
     12 # Companion to x86_64-mont.pl that optimizes cache-timing attack
     13 # countermeasures. The subroutines are produced by replacing bp[i]
     14 # references in their x86_64-mont.pl counterparts with cache-neutral
     15 # references to powers table computed in BN_mod_exp_mont_consttime.
     16 # In addition, a subroutine that scatters elements of the powers
     17 # table is implemented, so that scattering/gathering can be tuned
     18 # without modifying bn_exp.c.
     19 
     20 $flavour = shift;	# output flavour, handed on to x86_64-xlate.pl
     21 $output  = shift;	# output file name, handed on to x86_64-xlate.pl
     22 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }	# a first argument containing a dot is the output file
     23 
     24 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);	# Win64 is selected by flavour or by a .asm output name
     25 
     26 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of the path this script was started with
     27 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     28 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     29 die "can't locate x86_64-xlate.pl";
     30 
     31 open STDOUT,"| \"$^X\" \"$xlate\" $flavour $output" or die "can't call $xlate: $!";	# everything printed goes through the translator; quote paths that may contain spaces and fail loudly if the pipe cannot be set up
     32 
     33 # int bn_mul_mont_gather5(
     34 $rp="%rdi";	# BN_ULONG *rp,
     35 $ap="%rsi";	# const BN_ULONG *ap,
     36 $bp="%rdx";	# const BN_ULONG *bp,
     37 $np="%rcx";	# const BN_ULONG *np,
     38 $n0="%r8";	# const BN_ULONG *n0,
     39 $num="%r9";	# int num,
     40 		# int idx);	# 0 to 2^5-1, "index" in $bp holding
     41 				# pre-computed powers of a', interlaced
     42 				# in such manner that b[0] is $bp[idx],
     43 				# b[1] is [2^5+idx], etc.
     44 $lo0="%r10";	# low word of ap[0]*bp[i]; reused for tp[j] in the inner loop
     45 $hi0="%r11";	# running carry of the ap[j]*bp[i] products
     46 $hi1="%r13";	# running carry of the np[j]*m1 products
     47 $i="%r14";	# outer loop counter
     48 $j="%r15";	# inner loop counter
     49 $m0="%rbx";	# current multiplier word bp[i]
     50 $m1="%rbp";	# tp[0]*n0 for the current outer iteration
     51 
     52 $code=<<___;	# entry: num divisible by 4 and >=8 goes to .Lmul4x_enter, everything else to .Lmul_enter
     53 .text
     54 
     55 .globl	bn_mul_mont_gather5
     56 .type	bn_mul_mont_gather5,\@function,6
     57 .align	64
     58 bn_mul_mont_gather5:
     59 	test	\$3,${num}d
     60 	jnz	.Lmul_enter
     61 	cmp	\$8,${num}d
     62 	jb	.Lmul_enter
     63 	jmp	.Lmul4x_enter
     64 
     65 .align	16
     66 .Lmul_enter:
     67 	mov	${num}d,${num}d
     68 	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
     69 	push	%rbx
     70 	push	%rbp
     71 	push	%r12
     72 	push	%r13
     73 	push	%r14
     74 	push	%r15
     75 ___
     76 $code.=<<___ if ($win64);	# Win64 only: spill %xmm6/%xmm7, which the gather code overwrites
     77 	lea	-0x28(%rsp),%rsp
     78 	movaps	%xmm6,(%rsp)
     79 	movaps	%xmm7,0x10(%rsp)
     80 .Lmul_alloca:
     81 ___
     82 $code.=<<___;	# allocate tp[] on the stack and keep the pre-allocation %rsp in tp[num+1]
     83 	mov	%rsp,%rax
     84 	lea	2($num),%r11
     85 	neg	%r11
     86 	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
     87 	and	\$-1024,%rsp		# minimize TLB usage
     88 
     89 	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
     90 .Lmul_body:
     91 	mov	$bp,%r12		# reassign $bp
     92 ___
     93 		$bp="%r12";	# later heredocs address the table through the copy made above
     94 		$STRIDE=2**5*8;		# 5 is "window size"
     95 		$N=$STRIDE/4;		# should match cache line size
     96 $code.=<<___;	# main body: masked gather of bp[i] from four locations $STRIDE/4 apart, multiply/reduce loops, final subtraction and copy
     97 	mov	%r10,%r11
     98 	shr	\$`log($N/8)/log(2)`,%r10
     99 	and	\$`$N/8-1`,%r11
    100 	not	%r10
    101 	lea	.Lmagic_masks(%rip),%rax
    102 	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
    103 	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
    104 	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
    105 	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
    106 	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
    107 	movq	24(%rax,%r10,8),%xmm7
    108 
    109 	movq	`0*$STRIDE/4-96`($bp),%xmm0
    110 	movq	`1*$STRIDE/4-96`($bp),%xmm1
    111 	pand	%xmm4,%xmm0
    112 	movq	`2*$STRIDE/4-96`($bp),%xmm2
    113 	pand	%xmm5,%xmm1
    114 	movq	`3*$STRIDE/4-96`($bp),%xmm3
    115 	pand	%xmm6,%xmm2
    116 	por	%xmm1,%xmm0
    117 	pand	%xmm7,%xmm3
    118 	por	%xmm2,%xmm0
    119 	lea	$STRIDE($bp),$bp
    120 	por	%xmm3,%xmm0
    121 
    122 	movq	%xmm0,$m0		# m0=bp[0]
    123 
    124 	mov	($n0),$n0		# pull n0[0] value
    125 	mov	($ap),%rax
    126 
    127 	xor	$i,$i			# i=0
    128 	xor	$j,$j			# j=0
    129 
    130 	movq	`0*$STRIDE/4-96`($bp),%xmm0
    131 	movq	`1*$STRIDE/4-96`($bp),%xmm1
    132 	pand	%xmm4,%xmm0
    133 	movq	`2*$STRIDE/4-96`($bp),%xmm2
    134 	pand	%xmm5,%xmm1
    135 
    136 	mov	$n0,$m1
    137 	mulq	$m0			# ap[0]*bp[0]
    138 	mov	%rax,$lo0
    139 	mov	($np),%rax
    140 
    141 	movq	`3*$STRIDE/4-96`($bp),%xmm3
    142 	pand	%xmm6,%xmm2
    143 	por	%xmm1,%xmm0
    144 	pand	%xmm7,%xmm3
    145 
    146 	imulq	$lo0,$m1		# "tp[0]"*n0
    147 	mov	%rdx,$hi0
    148 
    149 	por	%xmm2,%xmm0
    150 	lea	$STRIDE($bp),$bp
    151 	por	%xmm3,%xmm0
    152 
    153 	mulq	$m1			# np[0]*m1
    154 	add	%rax,$lo0		# discarded
    155 	mov	8($ap),%rax
    156 	adc	\$0,%rdx
    157 	mov	%rdx,$hi1
    158 
    159 	lea	1($j),$j		# j++
    160 	jmp	.L1st_enter
    161 
    162 .align	16
    163 .L1st:
    164 	add	%rax,$hi1
    165 	mov	($ap,$j,8),%rax
    166 	adc	\$0,%rdx
    167 	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
    168 	mov	$lo0,$hi0
    169 	adc	\$0,%rdx
    170 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    171 	mov	%rdx,$hi1
    172 
    173 .L1st_enter:
    174 	mulq	$m0			# ap[j]*bp[0]
    175 	add	%rax,$hi0
    176 	mov	($np,$j,8),%rax
    177 	adc	\$0,%rdx
    178 	lea	1($j),$j		# j++
    179 	mov	%rdx,$lo0
    180 
    181 	mulq	$m1			# np[j]*m1
    182 	cmp	$num,$j
    183 	jne	.L1st
    184 
    185 	movq	%xmm0,$m0		# bp[1]
    186 
    187 	add	%rax,$hi1
    188 	mov	($ap),%rax		# ap[0]
    189 	adc	\$0,%rdx
    190 	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
    191 	adc	\$0,%rdx
    192 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    193 	mov	%rdx,$hi1
    194 	mov	$lo0,$hi0
    195 
    196 	xor	%rdx,%rdx
    197 	add	$hi0,$hi1
    198 	adc	\$0,%rdx
    199 	mov	$hi1,-8(%rsp,$num,8)
    200 	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
    201 
    202 	lea	1($i),$i		# i++
    203 	jmp	.Louter
    204 .align	16
    205 .Louter:
    206 	xor	$j,$j			# j=0
    207 	mov	$n0,$m1
    208 	mov	(%rsp),$lo0
    209 
    210 	movq	`0*$STRIDE/4-96`($bp),%xmm0
    211 	movq	`1*$STRIDE/4-96`($bp),%xmm1
    212 	pand	%xmm4,%xmm0
    213 	movq	`2*$STRIDE/4-96`($bp),%xmm2
    214 	pand	%xmm5,%xmm1
    215 
    216 	mulq	$m0			# ap[0]*bp[i]
    217 	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
    218 	mov	($np),%rax
    219 	adc	\$0,%rdx
    220 
    221 	movq	`3*$STRIDE/4-96`($bp),%xmm3
    222 	pand	%xmm6,%xmm2
    223 	por	%xmm1,%xmm0
    224 	pand	%xmm7,%xmm3
    225 
    226 	imulq	$lo0,$m1		# tp[0]*n0
    227 	mov	%rdx,$hi0
    228 
    229 	por	%xmm2,%xmm0
    230 	lea	$STRIDE($bp),$bp
    231 	por	%xmm3,%xmm0
    232 
    233 	mulq	$m1			# np[0]*m1
    234 	add	%rax,$lo0		# discarded
    235 	mov	8($ap),%rax
    236 	adc	\$0,%rdx
    237 	mov	8(%rsp),$lo0		# tp[1]
    238 	mov	%rdx,$hi1
    239 
    240 	lea	1($j),$j		# j++
    241 	jmp	.Linner_enter
    242 
    243 .align	16
    244 .Linner:
    245 	add	%rax,$hi1
    246 	mov	($ap,$j,8),%rax
    247 	adc	\$0,%rdx
    248 	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
    249 	mov	(%rsp,$j,8),$lo0
    250 	adc	\$0,%rdx
    251 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    252 	mov	%rdx,$hi1
    253 
    254 .Linner_enter:
    255 	mulq	$m0			# ap[j]*bp[i]
    256 	add	%rax,$hi0
    257 	mov	($np,$j,8),%rax
    258 	adc	\$0,%rdx
    259 	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
    260 	mov	%rdx,$hi0
    261 	adc	\$0,$hi0
    262 	lea	1($j),$j		# j++
    263 
    264 	mulq	$m1			# np[j]*m1
    265 	cmp	$num,$j
    266 	jne	.Linner
    267 
    268 	movq	%xmm0,$m0		# bp[i+1]
    269 
    270 	add	%rax,$hi1
    271 	mov	($ap),%rax		# ap[0]
    272 	adc	\$0,%rdx
    273 	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
    274 	mov	(%rsp,$j,8),$lo0
    275 	adc	\$0,%rdx
    276 	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    277 	mov	%rdx,$hi1
    278 
    279 	xor	%rdx,%rdx
    280 	add	$hi0,$hi1
    281 	adc	\$0,%rdx
    282 	add	$lo0,$hi1		# pull upmost overflow bit
    283 	adc	\$0,%rdx
    284 	mov	$hi1,-8(%rsp,$num,8)
    285 	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
    286 
    287 	lea	1($i),$i		# i++
    288 	cmp	$num,$i
    289 	jl	.Louter
    290 
    291 	xor	$i,$i			# i=0 and clear CF!
    292 	mov	(%rsp),%rax		# tp[0]
    293 	lea	(%rsp),$ap		# borrow ap for tp
    294 	mov	$num,$j			# j=num
    295 	jmp	.Lsub
    296 .align	16
    297 .Lsub:	sbb	($np,$i,8),%rax
    298 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
    299 	mov	8($ap,$i,8),%rax	# tp[i+1]
    300 	lea	1($i),$i		# i++
    301 	dec	$j			# doesnn't affect CF!
    302 	jnz	.Lsub
    303 
    304 	sbb	\$0,%rax		# handle upmost overflow bit
    305 	xor	$i,$i
    306 	and	%rax,$ap
    307 	not	%rax
    308 	mov	$rp,$np
    309 	and	%rax,$np
    310 	mov	$num,$j			# j=num
    311 	or	$np,$ap			# ap=borrow?tp:rp
    312 .align	16
    313 .Lcopy:					# copy or in-place refresh
    314 	mov	($ap,$i,8),%rax
    315 	mov	$i,(%rsp,$i,8)		# zap temporary vector
    316 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
    317 	lea	1($i),$i
    318 	sub	\$1,$j
    319 	jnz	.Lcopy
    320 
    321 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
    322 	mov	\$1,%rax
    323 ___
    324 $code.=<<___ if ($win64);	# Win64 only: reload %xmm6/%xmm7 from the area filled in the prologue
    325 	movaps	(%rsi),%xmm6
    326 	movaps	0x10(%rsi),%xmm7
    327 	lea	0x28(%rsi),%rsi
    328 ___
    329 $code.=<<___;	# epilogue: reload the six pushed registers, restore %rsp, return 1 in %rax
    330 	mov	(%rsi),%r15
    331 	mov	8(%rsi),%r14
    332 	mov	16(%rsi),%r13
    333 	mov	24(%rsi),%r12
    334 	mov	32(%rsi),%rbp
    335 	mov	40(%rsi),%rbx
    336 	lea	48(%rsi),%rsp
    337 .Lmul_epilogue:
    338 	ret
    339 .size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
    340 ___
    341 {{{	# 4x-unrolled variant, entered through .Lmul4x_enter from bn_mul_mont_gather5
    342 my @A=("%r10","%r11");	# alternating accumulators for the ap[j]*bp[i] products
    343 my @N=("%r13","%rdi");	# alternating accumulators for the np[j]*m1 products; %rdi is reusable because $rp is parked in tp[num+2]
    344 $code.=<<___;	# prologue: same register saves and 7th-argument load as the generic path
    345 .type	bn_mul4x_mont_gather5,\@function,6
    346 .align	16
    347 bn_mul4x_mont_gather5:
    348 .Lmul4x_enter:
    349 	mov	${num}d,${num}d
    350 	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
    351 	push	%rbx
    352 	push	%rbp
    353 	push	%r12
    354 	push	%r13
    355 	push	%r14
    356 	push	%r15
    357 ___
    358 $code.=<<___ if ($win64);	# Win64 only: spill %xmm6/%xmm7, which the gather code overwrites
    359 	lea	-0x28(%rsp),%rsp
    360 	movaps	%xmm6,(%rsp)
    361 	movaps	%xmm7,0x10(%rsp)
    362 .Lmul4x_alloca:
    363 ___
    364 $code.=<<___;	# allocate tp[]; tp[num+1] keeps the old %rsp and tp[num+2] keeps $rp
    365 	mov	%rsp,%rax
    366 	lea	4($num),%r11
    367 	neg	%r11
    368 	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4))
    369 	and	\$-1024,%rsp		# minimize TLB usage
    370 
    371 	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
    372 .Lmul4x_body:
    373 	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
    374 	mov	%rdx,%r12		# reassign $bp
    375 ___
    376 		$bp="%r12";	# later heredocs address the table through the copy made above
    377 		$STRIDE=2**5*8;		# 5 is "window size"
    378 		$N=$STRIDE/4;		# should match cache line size
    379 $code.=<<___;	# main body: masked gather of bp[i], first row (.L1st4x), remaining rows (.Louter4x/.Linner4x)
    380 	mov	%r10,%r11
    381 	shr	\$`log($N/8)/log(2)`,%r10
    382 	and	\$`$N/8-1`,%r11
    383 	not	%r10
    384 	lea	.Lmagic_masks(%rip),%rax
    385 	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
    386 	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
    387 	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
    388 	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
    389 	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
    390 	movq	24(%rax,%r10,8),%xmm7
    391 
    392 	movq	`0*$STRIDE/4-96`($bp),%xmm0
    393 	movq	`1*$STRIDE/4-96`($bp),%xmm1
    394 	pand	%xmm4,%xmm0
    395 	movq	`2*$STRIDE/4-96`($bp),%xmm2
    396 	pand	%xmm5,%xmm1
    397 	movq	`3*$STRIDE/4-96`($bp),%xmm3
    398 	pand	%xmm6,%xmm2
    399 	por	%xmm1,%xmm0
    400 	pand	%xmm7,%xmm3
    401 	por	%xmm2,%xmm0
    402 	lea	$STRIDE($bp),$bp
    403 	por	%xmm3,%xmm0
    404 
    405 	movq	%xmm0,$m0		# m0=bp[0]
    406 	mov	($n0),$n0		# pull n0[0] value
    407 	mov	($ap),%rax
    408 
    409 	xor	$i,$i			# i=0
    410 	xor	$j,$j			# j=0
    411 
    412 	movq	`0*$STRIDE/4-96`($bp),%xmm0
    413 	movq	`1*$STRIDE/4-96`($bp),%xmm1
    414 	pand	%xmm4,%xmm0
    415 	movq	`2*$STRIDE/4-96`($bp),%xmm2
    416 	pand	%xmm5,%xmm1
    417 
    418 	mov	$n0,$m1
    419 	mulq	$m0			# ap[0]*bp[0]
    420 	mov	%rax,$A[0]
    421 	mov	($np),%rax
    422 
    423 	movq	`3*$STRIDE/4-96`($bp),%xmm3
    424 	pand	%xmm6,%xmm2
    425 	por	%xmm1,%xmm0
    426 	pand	%xmm7,%xmm3
    427 
    428 	imulq	$A[0],$m1		# "tp[0]"*n0
    429 	mov	%rdx,$A[1]
    430 
    431 	por	%xmm2,%xmm0
    432 	lea	$STRIDE($bp),$bp
    433 	por	%xmm3,%xmm0
    434 
    435 	mulq	$m1			# np[0]*m1
    436 	add	%rax,$A[0]		# discarded
    437 	mov	8($ap),%rax
    438 	adc	\$0,%rdx
    439 	mov	%rdx,$N[1]
    440 
    441 	mulq	$m0
    442 	add	%rax,$A[1]
    443 	mov	8($np),%rax
    444 	adc	\$0,%rdx
    445 	mov	%rdx,$A[0]
    446 
    447 	mulq	$m1
    448 	add	%rax,$N[1]
    449 	mov	16($ap),%rax
    450 	adc	\$0,%rdx
    451 	add	$A[1],$N[1]
    452 	lea	4($j),$j		# j++
    453 	adc	\$0,%rdx
    454 	mov	$N[1],(%rsp)
    455 	mov	%rdx,$N[0]
    456 	jmp	.L1st4x
    457 .align	16
    458 .L1st4x:
    459 	mulq	$m0			# ap[j]*bp[0]
    460 	add	%rax,$A[0]
    461 	mov	-16($np,$j,8),%rax
    462 	adc	\$0,%rdx
    463 	mov	%rdx,$A[1]
    464 
    465 	mulq	$m1			# np[j]*m1
    466 	add	%rax,$N[0]
    467 	mov	-8($ap,$j,8),%rax
    468 	adc	\$0,%rdx
    469 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    470 	adc	\$0,%rdx
    471 	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    472 	mov	%rdx,$N[1]
    473 
    474 	mulq	$m0			# ap[j]*bp[0]
    475 	add	%rax,$A[1]
    476 	mov	-8($np,$j,8),%rax
    477 	adc	\$0,%rdx
    478 	mov	%rdx,$A[0]
    479 
    480 	mulq	$m1			# np[j]*m1
    481 	add	%rax,$N[1]
    482 	mov	($ap,$j,8),%rax
    483 	adc	\$0,%rdx
    484 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    485 	adc	\$0,%rdx
    486 	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    487 	mov	%rdx,$N[0]
    488 
    489 	mulq	$m0			# ap[j]*bp[0]
    490 	add	%rax,$A[0]
    491 	mov	($np,$j,8),%rax
    492 	adc	\$0,%rdx
    493 	mov	%rdx,$A[1]
    494 
    495 	mulq	$m1			# np[j]*m1
    496 	add	%rax,$N[0]
    497 	mov	8($ap,$j,8),%rax
    498 	adc	\$0,%rdx
    499 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    500 	adc	\$0,%rdx
    501 	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
    502 	mov	%rdx,$N[1]
    503 
    504 	mulq	$m0			# ap[j]*bp[0]
    505 	add	%rax,$A[1]
    506 	mov	8($np,$j,8),%rax
    507 	adc	\$0,%rdx
    508 	lea	4($j),$j		# j++
    509 	mov	%rdx,$A[0]
    510 
    511 	mulq	$m1			# np[j]*m1
    512 	add	%rax,$N[1]
    513 	mov	-16($ap,$j,8),%rax
    514 	adc	\$0,%rdx
    515 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    516 	adc	\$0,%rdx
    517 	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
    518 	mov	%rdx,$N[0]
    519 	cmp	$num,$j
    520 	jl	.L1st4x
    521 
    522 	mulq	$m0			# ap[j]*bp[0]
    523 	add	%rax,$A[0]
    524 	mov	-16($np,$j,8),%rax
    525 	adc	\$0,%rdx
    526 	mov	%rdx,$A[1]
    527 
    528 	mulq	$m1			# np[j]*m1
    529 	add	%rax,$N[0]
    530 	mov	-8($ap,$j,8),%rax
    531 	adc	\$0,%rdx
    532 	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    533 	adc	\$0,%rdx
    534 	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    535 	mov	%rdx,$N[1]
    536 
    537 	mulq	$m0			# ap[j]*bp[0]
    538 	add	%rax,$A[1]
    539 	mov	-8($np,$j,8),%rax
    540 	adc	\$0,%rdx
    541 	mov	%rdx,$A[0]
    542 
    543 	mulq	$m1			# np[j]*m1
    544 	add	%rax,$N[1]
    545 	mov	($ap),%rax		# ap[0]
    546 	adc	\$0,%rdx
    547 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    548 	adc	\$0,%rdx
    549 	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    550 	mov	%rdx,$N[0]
    551 
    552 	movq	%xmm0,$m0		# bp[1]
    553 
    554 	xor	$N[1],$N[1]
    555 	add	$A[0],$N[0]
    556 	adc	\$0,$N[1]
    557 	mov	$N[0],-8(%rsp,$j,8)
    558 	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
    559 
    560 	lea	1($i),$i		# i++
    561 .align	4
    562 .Louter4x:
    563 	xor	$j,$j			# j=0
    564 	movq	`0*$STRIDE/4-96`($bp),%xmm0
    565 	movq	`1*$STRIDE/4-96`($bp),%xmm1
    566 	pand	%xmm4,%xmm0
    567 	movq	`2*$STRIDE/4-96`($bp),%xmm2
    568 	pand	%xmm5,%xmm1
    569 
    570 	mov	(%rsp),$A[0]
    571 	mov	$n0,$m1
    572 	mulq	$m0			# ap[0]*bp[i]
    573 	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
    574 	mov	($np),%rax
    575 	adc	\$0,%rdx
    576 
    577 	movq	`3*$STRIDE/4-96`($bp),%xmm3
    578 	pand	%xmm6,%xmm2
    579 	por	%xmm1,%xmm0
    580 	pand	%xmm7,%xmm3
    581 
    582 	imulq	$A[0],$m1		# tp[0]*n0
    583 	mov	%rdx,$A[1]
    584 
    585 	por	%xmm2,%xmm0
    586 	lea	$STRIDE($bp),$bp
    587 	por	%xmm3,%xmm0
    588 
    589 	mulq	$m1			# np[0]*m1
    590 	add	%rax,$A[0]		# "$N[0]", discarded
    591 	mov	8($ap),%rax
    592 	adc	\$0,%rdx
    593 	mov	%rdx,$N[1]
    594 
    595 	mulq	$m0			# ap[j]*bp[i]
    596 	add	%rax,$A[1]
    597 	mov	8($np),%rax
    598 	adc	\$0,%rdx
    599 	add	8(%rsp),$A[1]		# +tp[1]
    600 	adc	\$0,%rdx
    601 	mov	%rdx,$A[0]
    602 
    603 	mulq	$m1			# np[j]*m1
    604 	add	%rax,$N[1]
    605 	mov	16($ap),%rax
    606 	adc	\$0,%rdx
    607 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
    608 	lea	4($j),$j		# j+=2
    609 	adc	\$0,%rdx
    610 	mov	%rdx,$N[0]
    611 	jmp	.Linner4x
    612 .align	16
    613 .Linner4x:
    614 	mulq	$m0			# ap[j]*bp[i]
    615 	add	%rax,$A[0]
    616 	mov	-16($np,$j,8),%rax
    617 	adc	\$0,%rdx
    618 	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
    619 	adc	\$0,%rdx
    620 	mov	%rdx,$A[1]
    621 
    622 	mulq	$m1			# np[j]*m1
    623 	add	%rax,$N[0]
    624 	mov	-8($ap,$j,8),%rax
    625 	adc	\$0,%rdx
    626 	add	$A[0],$N[0]
    627 	adc	\$0,%rdx
    628 	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
    629 	mov	%rdx,$N[1]
    630 
    631 	mulq	$m0			# ap[j]*bp[i]
    632 	add	%rax,$A[1]
    633 	mov	-8($np,$j,8),%rax
    634 	adc	\$0,%rdx
    635 	add	-8(%rsp,$j,8),$A[1]
    636 	adc	\$0,%rdx
    637 	mov	%rdx,$A[0]
    638 
    639 	mulq	$m1			# np[j]*m1
    640 	add	%rax,$N[1]
    641 	mov	($ap,$j,8),%rax
    642 	adc	\$0,%rdx
    643 	add	$A[1],$N[1]
    644 	adc	\$0,%rdx
    645 	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    646 	mov	%rdx,$N[0]
    647 
    648 	mulq	$m0			# ap[j]*bp[i]
    649 	add	%rax,$A[0]
    650 	mov	($np,$j,8),%rax
    651 	adc	\$0,%rdx
    652 	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
    653 	adc	\$0,%rdx
    654 	mov	%rdx,$A[1]
    655 
    656 	mulq	$m1			# np[j]*m1
    657 	add	%rax,$N[0]
    658 	mov	8($ap,$j,8),%rax
    659 	adc	\$0,%rdx
    660 	add	$A[0],$N[0]
    661 	adc	\$0,%rdx
    662 	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    663 	mov	%rdx,$N[1]
    664 
    665 	mulq	$m0			# ap[j]*bp[i]
    666 	add	%rax,$A[1]
    667 	mov	8($np,$j,8),%rax
    668 	adc	\$0,%rdx
    669 	add	8(%rsp,$j,8),$A[1]
    670 	adc	\$0,%rdx
    671 	lea	4($j),$j		# j++
    672 	mov	%rdx,$A[0]
    673 
    674 	mulq	$m1			# np[j]*m1
    675 	add	%rax,$N[1]
    676 	mov	-16($ap,$j,8),%rax
    677 	adc	\$0,%rdx
    678 	add	$A[1],$N[1]
    679 	adc	\$0,%rdx
    680 	mov	$N[0],-40(%rsp,$j,8)	# tp[j-1]
    681 	mov	%rdx,$N[0]
    682 	cmp	$num,$j
    683 	jl	.Linner4x
    684 
    685 	mulq	$m0			# ap[j]*bp[i]
    686 	add	%rax,$A[0]
    687 	mov	-16($np,$j,8),%rax
    688 	adc	\$0,%rdx
    689 	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
    690 	adc	\$0,%rdx
    691 	mov	%rdx,$A[1]
    692 
    693 	mulq	$m1			# np[j]*m1
    694 	add	%rax,$N[0]
    695 	mov	-8($ap,$j,8),%rax
    696 	adc	\$0,%rdx
    697 	add	$A[0],$N[0]
    698 	adc	\$0,%rdx
    699 	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
    700 	mov	%rdx,$N[1]
    701 
    702 	mulq	$m0			# ap[j]*bp[i]
    703 	add	%rax,$A[1]
    704 	mov	-8($np,$j,8),%rax
    705 	adc	\$0,%rdx
    706 	add	-8(%rsp,$j,8),$A[1]
    707 	adc	\$0,%rdx
    708 	lea	1($i),$i		# i++
    709 	mov	%rdx,$A[0]
    710 
    711 	mulq	$m1			# np[j]*m1
    712 	add	%rax,$N[1]
    713 	mov	($ap),%rax		# ap[0]
    714 	adc	\$0,%rdx
    715 	add	$A[1],$N[1]
    716 	adc	\$0,%rdx
    717 	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    718 	mov	%rdx,$N[0]
    719 
    720 	movq	%xmm0,$m0		# bp[i+1]
    721 	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    722 
    723 	xor	$N[1],$N[1]
    724 	add	$A[0],$N[0]
    725 	adc	\$0,$N[1]
    726 	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
    727 	adc	\$0,$N[1]
    728 	mov	$N[0],-8(%rsp,$j,8)
    729 	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
    730 
    731 	cmp	$num,$i
    732 	jl	.Louter4x
    733 ___
    734 {
    735 my @ri=("%rax","%rdx",$m0,$m1);	# four registers cycled through by the unrolled subtraction
    736 $code.=<<___;	# final step: rp=tp-np four words at a time, then masked copy that also zeroes tp[]
    737 	mov	16(%rsp,$num,8),$rp	# restore $rp
    738 	mov	0(%rsp),@ri[0]		# tp[0]
    739 	pxor	%xmm0,%xmm0
    740 	mov	8(%rsp),@ri[1]		# tp[1]
    741 	shr	\$2,$num		# num/=4
    742 	lea	(%rsp),$ap		# borrow ap for tp
    743 	xor	$i,$i			# i=0 and clear CF!
    744 
    745 	sub	0($np),@ri[0]
    746 	mov	16($ap),@ri[2]		# tp[2]
    747 	mov	24($ap),@ri[3]		# tp[3]
    748 	sbb	8($np),@ri[1]
    749 	lea	-1($num),$j		# j=num/4-1
    750 	jmp	.Lsub4x
    751 .align	16
    752 .Lsub4x:
    753 	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
    754 	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
    755 	sbb	16($np,$i,8),@ri[2]
    756 	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
    757 	mov	40($ap,$i,8),@ri[1]
    758 	sbb	24($np,$i,8),@ri[3]
    759 	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
    760 	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
    761 	sbb	32($np,$i,8),@ri[0]
    762 	mov	48($ap,$i,8),@ri[2]
    763 	mov	56($ap,$i,8),@ri[3]
    764 	sbb	40($np,$i,8),@ri[1]
    765 	lea	4($i),$i		# i++
    766 	dec	$j			# doesnn't affect CF!
    767 	jnz	.Lsub4x
    768 
    769 	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
    770 	mov	32($ap,$i,8),@ri[0]	# load overflow bit
    771 	sbb	16($np,$i,8),@ri[2]
    772 	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
    773 	sbb	24($np,$i,8),@ri[3]
    774 	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
    775 
    776 	sbb	\$0,@ri[0]		# handle upmost overflow bit
    777 	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
    778 	xor	$i,$i			# i=0
    779 	and	@ri[0],$ap
    780 	not	@ri[0]
    781 	mov	$rp,$np
    782 	and	@ri[0],$np
    783 	lea	-1($num),$j
    784 	or	$np,$ap			# ap=borrow?tp:rp
    785 
    786 	movdqu	($ap),%xmm1
    787 	movdqa	%xmm0,(%rsp)
    788 	movdqu	%xmm1,($rp)
    789 	jmp	.Lcopy4x
    790 .align	16
    791 .Lcopy4x:					# copy or in-place refresh
    792 	movdqu	16($ap,$i),%xmm2
    793 	movdqu	32($ap,$i),%xmm1
    794 	movdqa	%xmm0,16(%rsp,$i)
    795 	movdqu	%xmm2,16($rp,$i)
    796 	movdqa	%xmm0,32(%rsp,$i)
    797 	movdqu	%xmm1,32($rp,$i)
    798 	lea	32($i),$i
    799 	dec	$j
    800 	jnz	.Lcopy4x
    801 
    802 	shl	\$2,$num
    803 	movdqu	16($ap,$i),%xmm2
    804 	movdqa	%xmm0,16(%rsp,$i)
    805 	movdqu	%xmm2,16($rp,$i)
    806 ___
    807 }
    808 $code.=<<___;	# fetch the saved stack pointer from tp[num+1]; return value is 1
    809 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
    810 	mov	\$1,%rax
    811 ___
    812 $code.=<<___ if ($win64);	# Win64 only: reload %xmm6/%xmm7 from the area filled in the prologue
    813 	movaps	(%rsi),%xmm6
    814 	movaps	0x10(%rsi),%xmm7
    815 	lea	0x28(%rsi),%rsi
    816 ___
    817 $code.=<<___;	# epilogue: reload the six pushed registers and restore %rsp
    818 	mov	(%rsi),%r15
    819 	mov	8(%rsi),%r14
    820 	mov	16(%rsi),%r13
    821 	mov	24(%rsi),%r12
    822 	mov	32(%rsi),%rbp
    823 	mov	40(%rsi),%rbx
    824 	lea	48(%rsi),%rsp
    825 .Lmul4x_epilogue:
    826 	ret
    827 .size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
    828 ___
    829 }}}
    830 
    831 {	# bn_scatter5(inp,num,tbl,idx) and bn_gather5(out,num,tbl,idx)
    832 my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
    833 				("%rdi","%rsi","%rdx","%rcx"); # Unix order
    834 my $out=$inp;	# bn_gather5 writes through its first argument
    835 my $STRIDE=2**5*8;	# byte distance between consecutive words of one table entry
    836 my $N=$STRIDE/4;	# byte distance between the four masked loads
    837 
    838 $code.=<<___;	# bn_scatter5 stores word k of inp at tbl[idx+32*k]; the bn_gather5 label follows
    839 .globl	bn_scatter5
    840 .type	bn_scatter5,\@abi-omnipotent
    841 .align	16
    842 bn_scatter5:
    843 	cmp	\$0, $num
    844 	jz	.Lscatter_epilogue
    845 	lea	($tbl,$idx,8),$tbl
    846 .Lscatter:
    847 	mov	($inp),%rax
    848 	lea	8($inp),$inp
    849 	mov	%rax,($tbl)
    850 	lea	32*8($tbl),$tbl
    851 	sub	\$1,$num
    852 	jnz	.Lscatter
    853 .Lscatter_epilogue:
    854 	ret
    855 .size	bn_scatter5,.-bn_scatter5
    856 
    857 .globl	bn_gather5
    858 .type	bn_gather5,\@abi-omnipotent
    859 .align	16
    860 bn_gather5:
    861 ___
    862 $code.=<<___ if ($win64);	# Win64 only: hand-encoded prologue spilling %xmm6/%xmm7, matching .LSEH_info_bn_gather5
    863 .LSEH_begin_bn_gather5:
    864 	# I can't trust assembler to use specific encoding:-(
    865 	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
    866 	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
    867 	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
    868 ___
    869 $code.=<<___;	# build the four selection masks from idx, then gather one word per iteration
    870 	mov	$idx,%r11
    871 	shr	\$`log($N/8)/log(2)`,$idx
    872 	and	\$`$N/8-1`,%r11
    873 	not	$idx
    874 	lea	.Lmagic_masks(%rip),%rax
    875 	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
    876 	lea	96($tbl,%r11,8),$tbl	# pointer within 1st cache line
    877 	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
    878 	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
    879 	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
    880 	movq	24(%rax,$idx,8),%xmm7
    881 	jmp	.Lgather
    882 .align	16
    883 .Lgather:
    884 	movq	`0*$STRIDE/4-96`($tbl),%xmm0
    885 	movq	`1*$STRIDE/4-96`($tbl),%xmm1
    886 	pand	%xmm4,%xmm0
    887 	movq	`2*$STRIDE/4-96`($tbl),%xmm2
    888 	pand	%xmm5,%xmm1
    889 	movq	`3*$STRIDE/4-96`($tbl),%xmm3
    890 	pand	%xmm6,%xmm2
    891 	por	%xmm1,%xmm0
    892 	pand	%xmm7,%xmm3
    893 	por	%xmm2,%xmm0
    894 	lea	$STRIDE($tbl),$tbl
    895 	por	%xmm3,%xmm0
    896 
    897 	movq	%xmm0,($out)		# m0=bp[0]
    898 	lea	8($out),$out
    899 	sub	\$1,$num
    900 	jnz	.Lgather
    901 ___
    902 $code.=<<___ if ($win64);	# Win64 only: reload (not re-store) the caller's %xmm6/%xmm7 before releasing the save area
    903 	movaps	(%rsp),%xmm6
    904 	movaps	0x10(%rsp),%xmm7
    905 	lea	0x28(%rsp),%rsp
    906 ___
    907 $code.=<<___;	# common return; .LSEH_end_bn_gather5 closes the range named in .pdata
    908 	ret
    909 .LSEH_end_bn_gather5:
    910 .size	bn_gather5,.-bn_gather5
    911 ___
    912 }
    913 $code.=<<___;	# mask table: the gather code loads four consecutive quadwords starting at index 3-(idx>>3), so exactly the mask in position idx>>3 is all-ones
    914 .align	64
    915 .Lmagic_masks:
    916 	.long	0,0, 0,0, 0,0, -1,-1
    917 	.long	0,0, 0,0, 0,0,  0,0
    918 .asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    919 ___
    920 
    921 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
    922 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
    923 if ($win64) {	# the SEH handler and unwind tables are emitted for Win64 builds only
    924 $rec="%rcx";	# arg1: EXCEPTION_RECORD *rec
    925 $frame="%rdx";	# arg2: ULONG64 frame
    926 $context="%r8";	# arg3: CONTEXT *context
    927 $disp="%r9";	# arg4: DISPATCHER_CONTEXT *disp
    928 
    929 $code.=<<___;	# mul_handler is shared by both multiply routines; HandlerData[] holds their alloca/body/epilogue labels
    930 .extern	__imp_RtlVirtualUnwind
    931 .type	mul_handler,\@abi-omnipotent
    932 .align	16
    933 mul_handler:
    934 	push	%rsi
    935 	push	%rdi
    936 	push	%rbx
    937 	push	%rbp
    938 	push	%r12
    939 	push	%r13
    940 	push	%r14
    941 	push	%r15
    942 	pushfq
    943 	sub	\$64,%rsp
    944 
    945 	mov	120($context),%rax	# pull context->Rax
    946 	mov	248($context),%rbx	# pull context->Rip
    947 
    948 	mov	8($disp),%rsi		# disp->ImageBase
    949 	mov	56($disp),%r11		# disp->HandlerData
    950 
    951 	mov	0(%r11),%r10d		# HandlerData[0]
    952 	lea	(%rsi,%r10),%r10	# end of prologue label
    953 	cmp	%r10,%rbx		# context->Rip<end of prologue label
    954 	jb	.Lcommon_seh_tail
    955 
    956 	lea	`40+48`(%rax),%rax
    957 
    958 	mov	4(%r11),%r10d		# HandlerData[1]
    959 	lea	(%rsi,%r10),%r10	# end of alloca label
    960 	cmp	%r10,%rbx		# context->Rip<end of alloca label
    961 	jb	.Lcommon_seh_tail
    962 
    963 	mov	152($context),%rax	# pull context->Rsp
    964 
    965 	mov	8(%r11),%r10d		# HandlerData[2]
    966 	lea	(%rsi,%r10),%r10	# epilogue label
    967 	cmp	%r10,%rbx		# context->Rip>=epilogue label
    968 	jae	.Lcommon_seh_tail
    969 
    970 	mov	192($context),%r10	# pull $num
    971 	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
    972 
    973 	movaps	(%rax),%xmm0
    974 	movaps	16(%rax),%xmm1
    975 	lea	`40+48`(%rax),%rax
    976 
    977 	mov	-8(%rax),%rbx
    978 	mov	-16(%rax),%rbp
    979 	mov	-24(%rax),%r12
    980 	mov	-32(%rax),%r13
    981 	mov	-40(%rax),%r14
    982 	mov	-48(%rax),%r15
    983 	mov	%rbx,144($context)	# restore context->Rbx
    984 	mov	%rbp,160($context)	# restore context->Rbp
    985 	mov	%r12,216($context)	# restore context->R12
    986 	mov	%r13,224($context)	# restore context->R13
    987 	mov	%r14,232($context)	# restore context->R14
    988 	mov	%r15,240($context)	# restore context->R15
    989 	movups	%xmm0,512($context)	# restore context->Xmm6
    990 	movups	%xmm1,528($context)	# restore context->Xmm7
    991 
    992 .Lcommon_seh_tail:
    993 	mov	8(%rax),%rdi
    994 	mov	16(%rax),%rsi
    995 	mov	%rax,152($context)	# restore context->Rsp
    996 	mov	%rsi,168($context)	# restore context->Rsi
    997 	mov	%rdi,176($context)	# restore context->Rdi
    998 
    999 	mov	40($disp),%rdi		# disp->ContextRecord
   1000 	mov	$context,%rsi		# context
   1001 	mov	\$154,%ecx		# sizeof(CONTEXT)
   1002 	.long	0xa548f3fc		# cld; rep movsq
   1003 
   1004 	mov	$disp,%rsi
   1005 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   1006 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   1007 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   1008 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   1009 	mov	40(%rsi),%r10		# disp->ContextRecord
   1010 	lea	56(%rsi),%r11		# &disp->HandlerData
   1011 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   1012 	mov	%r10,32(%rsp)		# arg5
   1013 	mov	%r11,40(%rsp)		# arg6
   1014 	mov	%r12,48(%rsp)		# arg7
   1015 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   1016 	call	*__imp_RtlVirtualUnwind(%rip)
   1017 
   1018 	mov	\$1,%eax		# ExceptionContinueSearch
   1019 	add	\$64,%rsp
   1020 	popfq
   1021 	pop	%r15
   1022 	pop	%r14
   1023 	pop	%r13
   1024 	pop	%r12
   1025 	pop	%rbp
   1026 	pop	%rbx
   1027 	pop	%rdi
   1028 	pop	%rsi
   1029 	ret
   1030 .size	mul_handler,.-mul_handler
   1031 
   1032 .section	.pdata
   1033 .align	4
   1034 	.rva	.LSEH_begin_bn_mul_mont_gather5
   1035 	.rva	.LSEH_end_bn_mul_mont_gather5
   1036 	.rva	.LSEH_info_bn_mul_mont_gather5
   1037 
   1038 	.rva	.LSEH_begin_bn_mul4x_mont_gather5
   1039 	.rva	.LSEH_end_bn_mul4x_mont_gather5
   1040 	.rva	.LSEH_info_bn_mul4x_mont_gather5
   1041 
   1042 	.rva	.LSEH_begin_bn_gather5
   1043 	.rva	.LSEH_end_bn_gather5
   1044 	.rva	.LSEH_info_bn_gather5
   1045 
   1046 .section	.xdata
   1047 .align	8
   1048 .LSEH_info_bn_mul_mont_gather5:
   1049 	.byte	9,0,0,0
   1050 	.rva	mul_handler
   1051 	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
   1052 .align	8
   1053 .LSEH_info_bn_mul4x_mont_gather5:
   1054 	.byte	9,0,0,0
   1055 	.rva	mul_handler
   1056 	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
   1057 .align	8
   1058 .LSEH_info_bn_gather5:
   1059         .byte   0x01,0x0d,0x05,0x00
   1060         .byte   0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
   1061         .byte   0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
   1062         .byte   0x04,0x42,0x00,0x00	#sub	rsp,0x28
   1063 .align	8
   1064 ___
   1065 }
   1066 
   1067 $code =~ s/\`([^\`]*)\`/eval($1)/gem;	# replace every `expr` in the template with its evaluated value
   1068 
   1069 print $code;
   1070 close STDOUT or die "error closing STDOUT: $!";	# STDOUT is the pipe to the translator: a failed close means it did not finish cleanly
   1071