# Home | History | Annotate | Download | only in asm  (stray code-viewer banner, not part of the script)
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # August 2011.
     11 #
     12 # Companion to x86_64-mont.pl that optimizes cache-timing attack
     13 # countermeasures. The subroutines are produced by replacing bp[i]
     14 # references in their x86_64-mont.pl counterparts with cache-neutral
     15 # references to powers table computed in BN_mod_exp_mont_consttime.
     16 # In addition subroutine that scatters elements of the powers table
     17 # is implemented, so that scatter-/gathering can be tuned without
     18 # bn_exp.c modifications.
     19 
     20 $flavour = shift;
     21 $output  = shift;
     22 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     23 
     24 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     25 
     26 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     27 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     28 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     29 die "can't locate x86_64-xlate.pl";
     30 
     31 open OUT,"| \"$^X\" $xlate $flavour $output";
     32 *STDOUT=*OUT;
     33 
# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such manner that b[0] is $bp[idx],
				# b[1] is [2^5+idx], etc.
# Scratch registers used by the generated Montgomery loops below.
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";
     52 
# Generic (1x) Montgomery multiplication with SSE2 gather of bp[i] from the
# powers table. bn_mul_mont_gather5 itself only dispatches: num divisible by
# 4 and >= 8 goes to the 4x code further below. Heredoc text is emitted
# verbatim (after `...` interpolation) and fed to x86_64-xlate.pl.
$code=<<___;
.text

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	mov	${num}d,${num}d
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Win64 prologue: xmm6/xmm7 are non-volatile in the Win64 ABI and are
# clobbered by the gather code, so save them across the call.
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
.Lmul_alloca:
___
$code.=<<___;
	mov	%rsp,%rax
	lea	2($num),%r11
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
# Compute the four cache-line selection masks from the 7th argument (idx)
# so every gather touches all cache lines of a table row uniformly.
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	movq	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	movq	%xmm0,$m0		# bp[1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	movq	%xmm0,$m0		# bp[i+1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesnn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
___
# Win64 epilogue: restore the xmm6/xmm7 pair saved in the prologue.
$code.=<<___ if ($win64);
	movaps	(%rsi),%xmm6
	movaps	0x10(%rsi),%xmm7
	lea	0x28(%rsi),%rsi
___
# Restore callee-saved GPRs and the caller's stack pointer, return 1.
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
# 4x-unrolled Montgomery multiplication, reached from bn_mul_mont_gather5
# when num is a multiple of 4 and >= 8. @A/@N are pairs of alternating
# accumulator registers for the software-pipelined inner loops.
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
	mov	${num}d,${num}d
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Win64 prologue: save non-volatile xmm6/xmm7 (clobbered by the gather code).
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
.Lmul4x_alloca:
___
$code.=<<___;
	mov	%rsp,%rax
	lea	4($num),%r11
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
# Mask computation and first gather, identical in structure to the 1x path.
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	movq	%xmm0,$m0		# m0=bp[0]
	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[1]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	xor	$j,$j			# j=0
	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=2
	adc	\$0,%rdx
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-40(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[i+1]
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jl	.Louter4x
___
{
# Final conditional subtraction (tp - np) and copy-out, 4 words at a time.
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesnn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:					# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
___
# Win64 epilogue: restore xmm6/xmm7 saved in the prologue.
$code.=<<___ if ($win64);
	movaps	(%rsi),%xmm6
	movaps	0x10(%rsi),%xmm7
	lea	0x28(%rsi),%rsi
___
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
___
}}}
    831 
    832 {
    833 my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
    834 				("%rdi","%rsi","%rdx","%rcx"); # Unix order
    835 my $out=$inp;
    836 my $STRIDE=2**5*8;
    837 my $N=$STRIDE/4;
    838 
    839 $code.=<<___;
    840 .globl	bn_scatter5
    841 .type	bn_scatter5,\@abi-omnipotent
    842 .align	16
    843 bn_scatter5:
    844 	cmp	\$0, $num
    845 	jz	.Lscatter_epilogue
    846 	lea	($tbl,$idx,8),$tbl
    847 .Lscatter:
    848 	mov	($inp),%rax
    849 	lea	8($inp),$inp
    850 	mov	%rax,($tbl)
    851 	lea	32*8($tbl),$tbl
    852 	sub	\$1,$num
    853 	jnz	.Lscatter
    854 .Lscatter_epilogue:
    855 	ret
    856 .size	bn_scatter5,.-bn_scatter5
    857 
    858 .globl	bn_gather5
    859 .type	bn_gather5,\@abi-omnipotent
    860 .align	16
    861 bn_gather5:
    862 ___
    863 $code.=<<___ if ($win64);
    864 .LSEH_begin_bn_gather5:
    865 	# I can't trust assembler to use specific encoding:-(
    866 	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
    867 	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
    868 	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
    869 ___
    870 $code.=<<___;
    871 	mov	$idx,%r11
    872 	shr	\$`log($N/8)/log(2)`,$idx
    873 	and	\$`$N/8-1`,%r11
    874 	not	$idx
    875 	lea	.Lmagic_masks(%rip),%rax
    876 	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
    877 	lea	96($tbl,%r11,8),$tbl	# pointer within 1st cache line
    878 	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
    879 	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
    880 	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
    881 	movq	24(%rax,$idx,8),%xmm7
    882 	jmp	.Lgather
    883 .align	16
    884 .Lgather:
    885 	movq	`0*$STRIDE/4-96`($tbl),%xmm0
    886 	movq	`1*$STRIDE/4-96`($tbl),%xmm1
    887 	pand	%xmm4,%xmm0
    888 	movq	`2*$STRIDE/4-96`($tbl),%xmm2
    889 	pand	%xmm5,%xmm1
    890 	movq	`3*$STRIDE/4-96`($tbl),%xmm3
    891 	pand	%xmm6,%xmm2
    892 	por	%xmm1,%xmm0
    893 	pand	%xmm7,%xmm3
    894 	por	%xmm2,%xmm0
    895 	lea	$STRIDE($tbl),$tbl
    896 	por	%xmm3,%xmm0
    897 
    898 	movq	%xmm0,($out)		# m0=bp[0]
    899 	lea	8($out),$out
    900 	sub	\$1,$num
    901 	jnz	.Lgather
    902 ___
    903 $code.=<<___ if ($win64);
    904 	movaps	%xmm6,(%rsp)
    905 	movaps	%xmm7,0x10(%rsp)
    906 	lea	0x28(%rsp),%rsp
    907 ___
    908 $code.=<<___;
    909 	ret
    910 .LSEH_end_bn_gather5:
    911 .size	bn_gather5,.-bn_gather5
    912 ___
    913 }
# .Lmagic_masks: 8 dwords = four 64-bit lanes; exactly one lane is all-ones.
# Indexed by (idx / words-per-cache-line), it selects which of the four
# gathered quadwords survives the pand/por combine in the gather sequences.
$code.=<<___;
.align	64
.Lmagic_masks:
	.long	0,0, 0,0, 0,0, -1,-1
	.long	0,0, 0,0, 0,0,  0,0
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
    921 
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception-handling support: mul_handler unwinds the
# custom stack frames of the two Montgomery routines; the .pdata/.xdata
# sections register it (and the hand-encoded unwind info for bn_gather5).
if ($win64) {
# Win64 argument registers for the handler's C-style prototype above.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	lea	`40+48`(%rax),%rax

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# end of alloca label
	cmp	%r10,%rbx		# context->Rip<end of alloca label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	movaps	(%rax),%xmm0
	movaps	16(%rax),%xmm1
	lea	`40+48`(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	movups	%xmm0,512($context)	# restore context->Xmm6
	movups	%xmm1,528($context)	# restore context->Xmm7

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_gather5:
        .byte   0x01,0x0d,0x05,0x00
        .byte   0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
        .byte   0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
        .byte   0x04,0x42,0x00,0x00	#sub	rsp,0x28
.align	8
___
}
   1067 
   1068 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
   1069 
   1070 print $code;
   1071 close STDOUT;
   1072