      1 #!/usr/bin/env perl
      2 
      3 # Copyright (c) 2014, Intel Corporation.
      4 #
      5 # Permission to use, copy, modify, and/or distribute this software for any
      6 # purpose with or without fee is hereby granted, provided that the above
      7 # copyright notice and this permission notice appear in all copies.
      8 #
      9 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     10 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     11 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
     12 # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     13 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
     14 # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
     15 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     16 
     17 # Developers and authors:
     18 # Shay Gueron (1, 2), and Vlad Krasnov (1)
     19 # (1) Intel Corporation, Israel Development Center
     20 # (2) University of Haifa
     21 
     22 #  Reference:
     23 #  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
     24 #                           256 Bit Primes"
     25 
# Further optimization by <appro@openssl.org>:
     27 #
     28 #		this/original
     29 # Opteron	+12-49%
     30 # Bulldozer	+14-45%
     31 # P4		+18-46%
     32 # Westmere	+12-34%
     33 # Sandy Bridge	+9-35%
     34 # Ivy Bridge	+9-35%
     35 # Haswell	+8-37%
     36 # Broadwell	+18-58%
     37 # Atom		+15-50%
     38 # VIA Nano	+43-160%
     39 #
     40 # Ranges denote minimum and maximum improvement coefficients depending
     41 # on benchmark.
     42 
     43 $flavour = shift;
     44 $output  = shift;
     45 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     46 
     47 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     48 
     49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     50 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     51 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     52 die "can't locate x86_64-xlate.pl";
     53 
     54 open OUT,"| \"$^X\" $xlate $flavour $output";
     55 *STDOUT=*OUT;
     56 
     57 # TODO: enable these after testing. $avx goes to two and $addx to one.
     58 $avx=0;
     59 $addx=0;
     60 
     61 $code.=<<___;
     62 .text
     63 .extern	OPENSSL_ia32cap_P
     64 
# The polynomial, i.e. the P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
# stored as four 64-bit limbs, least significant first
     66 .align 64
     67 .Lpoly:
     68 .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
     69 
     70 .LOne:
     71 .long 1,1,1,1,1,1,1,1
     72 .LTwo:
     73 .long 2,2,2,2,2,2,2,2
     74 .LThree:
     75 .long 3,3,3,3,3,3,3,3
     76 .LONE_mont:
     77 .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
     78 ___
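################################################################################
# The constants above are easiest to audit against their closed forms:
# .Lpoly is the P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1 stored as
# four 64-bit limbs, least significant first, and .LONE_mont is 1 in the
# Montgomery domain, i.e. 2^256 mod p.  The check below is an illustrative
# sketch only: the sub name is ours and it is not called by the code
# generator (Math::BigInt is core Perl).

use Math::BigInt;

sub p256_ref_check_constants {
	my $two = Math::BigInt->new(2);
	my $p   = $two**256 - $two**224 + $two**192 + $two**96 - 1;
	# .Lpoly, limbs re-assembled most significant first
	my $poly = Math::BigInt->from_hex(
		"ffffffff00000001"."0000000000000000".
		"00000000ffffffff"."ffffffffffffffff");
	# .LONE_mont, likewise
	my $one_mont = Math::BigInt->from_hex(
		"00000000fffffffe"."ffffffffffffffff".
		"ffffffff00000000"."0000000000000001");
	die "bad .Lpoly"	unless $poly == $p;
	die "bad .LONE_mont"	unless $one_mont == ($two**256) % $p;
	return $p;
}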
     79 
     80 {
     81 ################################################################################
     82 # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
     83 
     84 my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
     85 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
     86 my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
     87 
     88 $code.=<<___;
     89 
     90 .type	ecp_nistz256_mul_by_2,\@function,2
     91 .align	64
     92 ecp_nistz256_mul_by_2:
     93 	push	%r12
     94 	push	%r13
     95 
     96 	mov	8*0($a_ptr), $a0
     97 	mov	8*1($a_ptr), $a1
     98 	add	$a0, $a0		# a0:a3+a0:a3
     99 	mov	8*2($a_ptr), $a2
    100 	adc	$a1, $a1
    101 	mov	8*3($a_ptr), $a3
    102 	lea	.Lpoly(%rip), $a_ptr
    103 	 mov	$a0, $t0
    104 	adc	$a2, $a2
    105 	adc	$a3, $a3
    106 	 mov	$a1, $t1
    107 	sbb	$t4, $t4
    108 
    109 	sub	8*0($a_ptr), $a0
    110 	 mov	$a2, $t2
    111 	sbb	8*1($a_ptr), $a1
    112 	sbb	8*2($a_ptr), $a2
    113 	 mov	$a3, $t3
    114 	sbb	8*3($a_ptr), $a3
    115 	test	$t4, $t4
    116 
    117 	cmovz	$t0, $a0
    118 	cmovz	$t1, $a1
    119 	mov	$a0, 8*0($r_ptr)
    120 	cmovz	$t2, $a2
    121 	mov	$a1, 8*1($r_ptr)
    122 	cmovz	$t3, $a3
    123 	mov	$a2, 8*2($r_ptr)
    124 	mov	$a3, 8*3($r_ptr)
    125 
    126 	pop	%r13
    127 	pop	%r12
    128 	ret
    129 .size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
    130 
    131 ################################################################################
    132 # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
    133 .globl	ecp_nistz256_neg
    134 .type	ecp_nistz256_neg,\@function,2
    135 .align	32
    136 ecp_nistz256_neg:
    137 	push	%r12
    138 	push	%r13
    139 
    140 	xor	$a0, $a0
    141 	xor	$a1, $a1
    142 	xor	$a2, $a2
    143 	xor	$a3, $a3
    144 	xor	$t4, $t4
    145 
    146 	sub	8*0($a_ptr), $a0
    147 	sbb	8*1($a_ptr), $a1
    148 	sbb	8*2($a_ptr), $a2
    149 	 mov	$a0, $t0
    150 	sbb	8*3($a_ptr), $a3
    151 	lea	.Lpoly(%rip), $a_ptr
    152 	 mov	$a1, $t1
    153 	sbb	\$0, $t4
    154 
    155 	add	8*0($a_ptr), $a0
    156 	 mov	$a2, $t2
    157 	adc	8*1($a_ptr), $a1
    158 	adc	8*2($a_ptr), $a2
    159 	 mov	$a3, $t3
    160 	adc	8*3($a_ptr), $a3
    161 	test	$t4, $t4
    162 
    163 	cmovz	$t0, $a0
    164 	cmovz	$t1, $a1
    165 	mov	$a0, 8*0($r_ptr)
    166 	cmovz	$t2, $a2
    167 	mov	$a1, 8*1($r_ptr)
    168 	cmovz	$t3, $a3
    169 	mov	$a2, 8*2($r_ptr)
    170 	mov	$a3, 8*3($r_ptr)
    171 
    172 	pop %r13
    173 	pop %r12
    174 	ret
    175 .size	ecp_nistz256_neg,.-ecp_nistz256_neg
    176 ___
    177 }
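################################################################################
# ecp_nistz256_mul_by_2 and ecp_nistz256_neg above compute 2*a mod p and
# -a mod p on four 64-bit limbs; the sbb/cmovz pairs pick between the raw
# and the p-adjusted result without a secret-dependent branch.  Below is a
# big-integer sketch of the intended arithmetic, for reference only: the
# sub names are ours and nothing here is used by the code generator.

use Math::BigInt;

sub p256_ref_mul_by_2 {		# 2*a mod p
	my ($a) = @_;
	my $two = Math::BigInt->new(2);
	my $p   = $two**256 - $two**224 + $two**192 + $two**96 - 1;
	my $r = $a * 2;
	$r -= $p if $r >= $p;	# conditional reduction, done branch-free above
	return $r;
}

sub p256_ref_neg {		# -a mod p
	my ($a) = @_;
	my $two = Math::BigInt->new(2);
	my $p   = $two**256 - $two**224 + $two**192 + $two**96 - 1;
	return (-$a) % $p;	# Math::BigInt's % already yields a value in [0,p)
}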
    178 {
    179 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
    180 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
    181 my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
    182 my ($poly1,$poly3)=($acc6,$acc7);
    183 
    184 $code.=<<___;
    185 ################################################################################
    186 # void ecp_nistz256_mul_mont(
    187 #   uint64_t res[4],
    188 #   uint64_t a[4],
    189 #   uint64_t b[4]);
    190 
    191 .globl	ecp_nistz256_mul_mont
    192 .type	ecp_nistz256_mul_mont,\@function,3
    193 .align	32
    194 ecp_nistz256_mul_mont:
    195 ___
    196 $code.=<<___	if ($addx);
    197 	mov	\$0x80100, %ecx
    198 	and	OPENSSL_ia32cap_P+8(%rip), %ecx
    199 ___
    200 $code.=<<___;
    201 .Lmul_mont:
    202 	push	%rbp
    203 	push	%rbx
    204 	push	%r12
    205 	push	%r13
    206 	push	%r14
    207 	push	%r15
    208 ___
    209 $code.=<<___	if ($addx);
    210 	cmp	\$0x80100, %ecx
    211 	je	.Lmul_montx
    212 ___
    213 $code.=<<___;
    214 	mov	$b_org, $b_ptr
    215 	mov	8*0($b_org), %rax
    216 	mov	8*0($a_ptr), $acc1
    217 	mov	8*1($a_ptr), $acc2
    218 	mov	8*2($a_ptr), $acc3
    219 	mov	8*3($a_ptr), $acc4
    220 
    221 	call	__ecp_nistz256_mul_montq
    222 ___
    223 $code.=<<___	if ($addx);
    224 	jmp	.Lmul_mont_done
    225 
    226 .align	32
    227 .Lmul_montx:
    228 	mov	$b_org, $b_ptr
    229 	mov	8*0($b_org), %rdx
    230 	mov	8*0($a_ptr), $acc1
    231 	mov	8*1($a_ptr), $acc2
    232 	mov	8*2($a_ptr), $acc3
    233 	mov	8*3($a_ptr), $acc4
    234 	lea	-128($a_ptr), $a_ptr	# control u-op density
    235 
    236 	call	__ecp_nistz256_mul_montx
    237 ___
    238 $code.=<<___;
    239 .Lmul_mont_done:
    240 	pop	%r15
    241 	pop	%r14
    242 	pop	%r13
    243 	pop	%r12
    244 	pop	%rbx
    245 	pop	%rbp
    246 	ret
    247 .size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
    248 
    249 .type	__ecp_nistz256_mul_montq,\@abi-omnipotent
    250 .align	32
    251 __ecp_nistz256_mul_montq:
    252 	########################################################################
    253 	# Multiply a by b[0]
    254 	mov	%rax, $t1
    255 	mulq	$acc1
    256 	mov	.Lpoly+8*1(%rip),$poly1
    257 	mov	%rax, $acc0
    258 	mov	$t1, %rax
    259 	mov	%rdx, $acc1
    260 
    261 	mulq	$acc2
    262 	mov	.Lpoly+8*3(%rip),$poly3
    263 	add	%rax, $acc1
    264 	mov	$t1, %rax
    265 	adc	\$0, %rdx
    266 	mov	%rdx, $acc2
    267 
    268 	mulq	$acc3
    269 	add	%rax, $acc2
    270 	mov	$t1, %rax
    271 	adc	\$0, %rdx
    272 	mov	%rdx, $acc3
    273 
    274 	mulq	$acc4
    275 	add	%rax, $acc3
    276 	 mov	$acc0, %rax
    277 	adc	\$0, %rdx
    278 	xor	$acc5, $acc5
    279 	mov	%rdx, $acc4
    280 
    281 	########################################################################
    282 	# First reduction step
	# Basically we now want to add acc[0]*p256 to the accumulator
	# (Montgomery reduction with m = acc[0], since -p256^-1 = 1 mod 2^64).
	# Due to the special form of p256 we can take a shortcut:
	#
	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
	# so adding the existing acc[0] back in leaves exactly acc[0] x 2^96,
	# and only acc[0]<<96 and acc[0] x p256[3] remain to be added.
    289 
    290 	mov	$acc0, $t1
    291 	shl	\$32, $acc0
    292 	mulq	$poly3
    293 	shr	\$32, $t1
    294 	add	$acc0, $acc1		# +=acc[0]<<96
    295 	adc	$t1, $acc2
    296 	adc	%rax, $acc3
    297 	 mov	8*1($b_ptr), %rax
    298 	adc	%rdx, $acc4
    299 	adc	\$0, $acc5
    300 	xor	$acc0, $acc0
    301 
    302 	########################################################################
    303 	# Multiply by b[1]
    304 	mov	%rax, $t1
    305 	mulq	8*0($a_ptr)
    306 	add	%rax, $acc1
    307 	mov	$t1, %rax
    308 	adc	\$0, %rdx
    309 	mov	%rdx, $t0
    310 
    311 	mulq	8*1($a_ptr)
    312 	add	$t0, $acc2
    313 	adc	\$0, %rdx
    314 	add	%rax, $acc2
    315 	mov	$t1, %rax
    316 	adc	\$0, %rdx
    317 	mov	%rdx, $t0
    318 
    319 	mulq	8*2($a_ptr)
    320 	add	$t0, $acc3
    321 	adc	\$0, %rdx
    322 	add	%rax, $acc3
    323 	mov	$t1, %rax
    324 	adc	\$0, %rdx
    325 	mov	%rdx, $t0
    326 
    327 	mulq	8*3($a_ptr)
    328 	add	$t0, $acc4
    329 	adc	\$0, %rdx
    330 	add	%rax, $acc4
    331 	 mov	$acc1, %rax
    332 	adc	%rdx, $acc5
    333 	adc	\$0, $acc0
    334 
    335 	########################################################################
    336 	# Second reduction step
    337 	mov	$acc1, $t1
    338 	shl	\$32, $acc1
    339 	mulq	$poly3
    340 	shr	\$32, $t1
    341 	add	$acc1, $acc2
    342 	adc	$t1, $acc3
    343 	adc	%rax, $acc4
    344 	 mov	8*2($b_ptr), %rax
    345 	adc	%rdx, $acc5
    346 	adc	\$0, $acc0
    347 	xor	$acc1, $acc1
    348 
    349 	########################################################################
    350 	# Multiply by b[2]
    351 	mov	%rax, $t1
    352 	mulq	8*0($a_ptr)
    353 	add	%rax, $acc2
    354 	mov	$t1, %rax
    355 	adc	\$0, %rdx
    356 	mov	%rdx, $t0
    357 
    358 	mulq	8*1($a_ptr)
    359 	add	$t0, $acc3
    360 	adc	\$0, %rdx
    361 	add	%rax, $acc3
    362 	mov	$t1, %rax
    363 	adc	\$0, %rdx
    364 	mov	%rdx, $t0
    365 
    366 	mulq	8*2($a_ptr)
    367 	add	$t0, $acc4
    368 	adc	\$0, %rdx
    369 	add	%rax, $acc4
    370 	mov	$t1, %rax
    371 	adc	\$0, %rdx
    372 	mov	%rdx, $t0
    373 
    374 	mulq	8*3($a_ptr)
    375 	add	$t0, $acc5
    376 	adc	\$0, %rdx
    377 	add	%rax, $acc5
    378 	 mov	$acc2, %rax
    379 	adc	%rdx, $acc0
    380 	adc	\$0, $acc1
    381 
    382 	########################################################################
    383 	# Third reduction step
    384 	mov	$acc2, $t1
    385 	shl	\$32, $acc2
    386 	mulq	$poly3
    387 	shr	\$32, $t1
    388 	add	$acc2, $acc3
    389 	adc	$t1, $acc4
    390 	adc	%rax, $acc5
    391 	 mov	8*3($b_ptr), %rax
    392 	adc	%rdx, $acc0
    393 	adc	\$0, $acc1
    394 	xor	$acc2, $acc2
    395 
    396 	########################################################################
    397 	# Multiply by b[3]
    398 	mov	%rax, $t1
    399 	mulq	8*0($a_ptr)
    400 	add	%rax, $acc3
    401 	mov	$t1, %rax
    402 	adc	\$0, %rdx
    403 	mov	%rdx, $t0
    404 
    405 	mulq	8*1($a_ptr)
    406 	add	$t0, $acc4
    407 	adc	\$0, %rdx
    408 	add	%rax, $acc4
    409 	mov	$t1, %rax
    410 	adc	\$0, %rdx
    411 	mov	%rdx, $t0
    412 
    413 	mulq	8*2($a_ptr)
    414 	add	$t0, $acc5
    415 	adc	\$0, %rdx
    416 	add	%rax, $acc5
    417 	mov	$t1, %rax
    418 	adc	\$0, %rdx
    419 	mov	%rdx, $t0
    420 
    421 	mulq	8*3($a_ptr)
    422 	add	$t0, $acc0
    423 	adc	\$0, %rdx
    424 	add	%rax, $acc0
    425 	 mov	$acc3, %rax
    426 	adc	%rdx, $acc1
    427 	adc	\$0, $acc2
    428 
    429 	########################################################################
    430 	# Final reduction step
    431 	mov	$acc3, $t1
    432 	shl	\$32, $acc3
    433 	mulq	$poly3
    434 	shr	\$32, $t1
    435 	add	$acc3, $acc4
    436 	adc	$t1, $acc5
    437 	 mov	$acc4, $t0
    438 	adc	%rax, $acc0
    439 	adc	%rdx, $acc1
    440 	 mov	$acc5, $t1
    441 	adc	\$0, $acc2
    442 
    443 	########################################################################
    444 	# Branch-less conditional subtraction of P
    445 	sub	\$-1, $acc4		# .Lpoly[0]
    446 	 mov	$acc0, $t2
    447 	sbb	$poly1, $acc5		# .Lpoly[1]
    448 	sbb	\$0, $acc0		# .Lpoly[2]
    449 	 mov	$acc1, $t3
    450 	sbb	$poly3, $acc1		# .Lpoly[3]
    451 	sbb	\$0, $acc2
    452 
    453 	cmovc	$t0, $acc4
    454 	cmovc	$t1, $acc5
    455 	mov	$acc4, 8*0($r_ptr)
    456 	cmovc	$t2, $acc0
    457 	mov	$acc5, 8*1($r_ptr)
    458 	cmovc	$t3, $acc1
    459 	mov	$acc0, 8*2($r_ptr)
    460 	mov	$acc1, 8*3($r_ptr)
    461 
    462 	ret
    463 .size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
    464 
    465 ################################################################################
    466 # void ecp_nistz256_sqr_mont(
    467 #   uint64_t res[4],
    468 #   uint64_t a[4]);
    469 
# We optimize the squaring according to S. Gueron and V. Krasnov,
    471 # "Speeding up Big-Number Squaring"
    472 .globl	ecp_nistz256_sqr_mont
    473 .type	ecp_nistz256_sqr_mont,\@function,2
    474 .align	32
    475 ecp_nistz256_sqr_mont:
    476 ___
    477 $code.=<<___	if ($addx);
    478 	mov	\$0x80100, %ecx
    479 	and	OPENSSL_ia32cap_P+8(%rip), %ecx
    480 ___
    481 $code.=<<___;
    482 	push	%rbp
    483 	push	%rbx
    484 	push	%r12
    485 	push	%r13
    486 	push	%r14
    487 	push	%r15
    488 ___
    489 $code.=<<___	if ($addx);
    490 	cmp	\$0x80100, %ecx
    491 	je	.Lsqr_montx
    492 ___
    493 $code.=<<___;
    494 	mov	8*0($a_ptr), %rax
    495 	mov	8*1($a_ptr), $acc6
    496 	mov	8*2($a_ptr), $acc7
    497 	mov	8*3($a_ptr), $acc0
    498 
    499 	call	__ecp_nistz256_sqr_montq
    500 ___
    501 $code.=<<___	if ($addx);
    502 	jmp	.Lsqr_mont_done
    503 
    504 .align	32
    505 .Lsqr_montx:
    506 	mov	8*0($a_ptr), %rdx
    507 	mov	8*1($a_ptr), $acc6
    508 	mov	8*2($a_ptr), $acc7
    509 	mov	8*3($a_ptr), $acc0
    510 	lea	-128($a_ptr), $a_ptr	# control u-op density
    511 
    512 	call	__ecp_nistz256_sqr_montx
    513 ___
    514 $code.=<<___;
    515 .Lsqr_mont_done:
    516 	pop	%r15
    517 	pop	%r14
    518 	pop	%r13
    519 	pop	%r12
    520 	pop	%rbx
    521 	pop	%rbp
    522 	ret
    523 .size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
    524 
    525 .type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
    526 .align	32
    527 __ecp_nistz256_sqr_montq:
    528 	mov	%rax, $acc5
    529 	mulq	$acc6			# a[1]*a[0]
    530 	mov	%rax, $acc1
    531 	mov	$acc7, %rax
    532 	mov	%rdx, $acc2
    533 
    534 	mulq	$acc5			# a[0]*a[2]
    535 	add	%rax, $acc2
    536 	mov	$acc0, %rax
    537 	adc	\$0, %rdx
    538 	mov	%rdx, $acc3
    539 
    540 	mulq	$acc5			# a[0]*a[3]
    541 	add	%rax, $acc3
    542 	 mov	$acc7, %rax
    543 	adc	\$0, %rdx
    544 	mov	%rdx, $acc4
    545 
    546 	#################################
    547 	mulq	$acc6			# a[1]*a[2]
    548 	add	%rax, $acc3
    549 	mov	$acc0, %rax
    550 	adc	\$0, %rdx
    551 	mov	%rdx, $t1
    552 
    553 	mulq	$acc6			# a[1]*a[3]
    554 	add	%rax, $acc4
    555 	 mov	$acc0, %rax
    556 	adc	\$0, %rdx
    557 	add	$t1, $acc4
    558 	mov	%rdx, $acc5
    559 	adc	\$0, $acc5
    560 
    561 	#################################
    562 	mulq	$acc7			# a[2]*a[3]
    563 	xor	$acc7, $acc7
    564 	add	%rax, $acc5
    565 	 mov	8*0($a_ptr), %rax
    566 	mov	%rdx, $acc6
    567 	adc	\$0, $acc6
    568 
    569 	add	$acc1, $acc1		# acc1:6<<1
    570 	adc	$acc2, $acc2
    571 	adc	$acc3, $acc3
    572 	adc	$acc4, $acc4
    573 	adc	$acc5, $acc5
    574 	adc	$acc6, $acc6
    575 	adc	\$0, $acc7
    576 
    577 	mulq	%rax
    578 	mov	%rax, $acc0
    579 	mov	8*1($a_ptr), %rax
    580 	mov	%rdx, $t0
    581 
    582 	mulq	%rax
    583 	add	$t0, $acc1
    584 	adc	%rax, $acc2
    585 	mov	8*2($a_ptr), %rax
    586 	adc	\$0, %rdx
    587 	mov	%rdx, $t0
    588 
    589 	mulq	%rax
    590 	add	$t0, $acc3
    591 	adc	%rax, $acc4
    592 	mov	8*3($a_ptr), %rax
    593 	adc	\$0, %rdx
    594 	mov	%rdx, $t0
    595 
    596 	mulq	%rax
    597 	add	$t0, $acc5
    598 	adc	%rax, $acc6
    599 	 mov	$acc0, %rax
    600 	adc	%rdx, $acc7
    601 
    602 	mov	.Lpoly+8*1(%rip), $a_ptr
    603 	mov	.Lpoly+8*3(%rip), $t1
    604 
    605 	##########################################
    606 	# Now the reduction
    607 	# First iteration
    608 	mov	$acc0, $t0
    609 	shl	\$32, $acc0
    610 	mulq	$t1
    611 	shr	\$32, $t0
    612 	add	$acc0, $acc1		# +=acc[0]<<96
    613 	adc	$t0, $acc2
    614 	adc	%rax, $acc3
    615 	 mov	$acc1, %rax
    616 	adc	\$0, %rdx
    617 
    618 	##########################################
    619 	# Second iteration
    620 	mov	$acc1, $t0
    621 	shl	\$32, $acc1
    622 	mov	%rdx, $acc0
    623 	mulq	$t1
    624 	shr	\$32, $t0
    625 	add	$acc1, $acc2
    626 	adc	$t0, $acc3
    627 	adc	%rax, $acc0
    628 	 mov	$acc2, %rax
    629 	adc	\$0, %rdx
    630 
    631 	##########################################
    632 	# Third iteration
    633 	mov	$acc2, $t0
    634 	shl	\$32, $acc2
    635 	mov	%rdx, $acc1
    636 	mulq	$t1
    637 	shr	\$32, $t0
    638 	add	$acc2, $acc3
    639 	adc	$t0, $acc0
    640 	adc	%rax, $acc1
    641 	 mov	$acc3, %rax
    642 	adc	\$0, %rdx
    643 
    644 	###########################################
    645 	# Last iteration
    646 	mov	$acc3, $t0
    647 	shl	\$32, $acc3
    648 	mov	%rdx, $acc2
    649 	mulq	$t1
    650 	shr	\$32, $t0
    651 	add	$acc3, $acc0
    652 	adc	$t0, $acc1
    653 	adc	%rax, $acc2
    654 	adc	\$0, %rdx
    655 	xor	$acc3, $acc3
    656 
    657 	############################################
    658 	# Add the rest of the acc
    659 	add	$acc0, $acc4
    660 	adc	$acc1, $acc5
    661 	 mov	$acc4, $acc0
    662 	adc	$acc2, $acc6
    663 	adc	%rdx, $acc7
    664 	 mov	$acc5, $acc1
    665 	adc	\$0, $acc3
    666 
    667 	sub	\$-1, $acc4		# .Lpoly[0]
    668 	 mov	$acc6, $acc2
    669 	sbb	$a_ptr, $acc5		# .Lpoly[1]
    670 	sbb	\$0, $acc6		# .Lpoly[2]
    671 	 mov	$acc7, $t0
    672 	sbb	$t1, $acc7		# .Lpoly[3]
    673 	sbb	\$0, $acc3
    674 
    675 	cmovc	$acc0, $acc4
    676 	cmovc	$acc1, $acc5
    677 	mov	$acc4, 8*0($r_ptr)
    678 	cmovc	$acc2, $acc6
    679 	mov	$acc5, 8*1($r_ptr)
    680 	cmovc	$t0, $acc7
    681 	mov	$acc6, 8*2($r_ptr)
    682 	mov	$acc7, 8*3($r_ptr)
    683 
    684 	ret
    685 .size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
    686 ___
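################################################################################
# __ecp_nistz256_mul_montq above computes the Montgomery product
# a*b*2^-256 mod p (R = 2^256), so with both inputs in Montgomery form the
# output is the Montgomery form of a*b.  Each of the four reduction steps
# relies on -p^-1 = 1 mod 2^64 (so m is simply the low limb) and on
# p[0..1] = 2^96 - 1, which lets "add m*p" collapse into adding m<<96 and
# m*p[3]<<192.  __ecp_nistz256_sqr_montq is the a == b case, reorganized
# as in "Speeding up Big-Number Squaring".  A big-integer sketch of the
# same computation, for reference only (sub names ours, not called by the
# generator):

use Math::BigInt;

sub p256_ref_mul_mont {		# a*b*2^-256 mod p
	my ($a, $b) = @_;
	my $two = Math::BigInt->new(2);
	my $p   = $two**256 - $two**224 + $two**192 + $two**96 - 1;
	my $acc = $a * $b;
	for (1 .. 4) {		# one reduction step per 64-bit limb
		my $m = $acc % ($two**64);	# m = acc * (-p^-1) mod 2^64 = acc mod 2^64
		$acc = ($acc + $m * $p) / ($two**64);	# exact division by 2^64
	}
	$acc -= $p if $acc >= $p;	# the branch-free conditional subtraction
	return $acc;
}

sub p256_ref_sqr_mont {		# a*a*2^-256 mod p
	my ($a) = @_;
	return p256_ref_mul_mont($a, $a);
}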
    687 
    688 if ($addx) {
    689 $code.=<<___;
    690 .type	__ecp_nistz256_mul_montx,\@abi-omnipotent
    691 .align	32
    692 __ecp_nistz256_mul_montx:
    693 	########################################################################
    694 	# Multiply by b[0]
    695 	mulx	$acc1, $acc0, $acc1
    696 	mulx	$acc2, $t0, $acc2
    697 	mov	\$32, $poly1
    698 	xor	$acc5, $acc5		# cf=0
    699 	mulx	$acc3, $t1, $acc3
    700 	mov	.Lpoly+8*3(%rip), $poly3
    701 	adc	$t0, $acc1
    702 	mulx	$acc4, $t0, $acc4
    703 	 mov	$acc0, %rdx
    704 	adc	$t1, $acc2
    705 	 shlx	$poly1,$acc0,$t1
    706 	adc	$t0, $acc3
    707 	 shrx	$poly1,$acc0,$t0
    708 	adc	\$0, $acc4
    709 
    710 	########################################################################
    711 	# First reduction step
    712 	add	$t1, $acc1
    713 	adc	$t0, $acc2
    714 
    715 	mulx	$poly3, $t0, $t1
    716 	 mov	8*1($b_ptr), %rdx
    717 	adc	$t0, $acc3
    718 	adc	$t1, $acc4
    719 	adc	\$0, $acc5
    720 	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
    721 
    722 	########################################################################
    723 	# Multiply by b[1]
    724 	mulx	8*0+128($a_ptr), $t0, $t1
    725 	adcx	$t0, $acc1
    726 	adox	$t1, $acc2
    727 
    728 	mulx	8*1+128($a_ptr), $t0, $t1
    729 	adcx	$t0, $acc2
    730 	adox	$t1, $acc3
    731 
    732 	mulx	8*2+128($a_ptr), $t0, $t1
    733 	adcx	$t0, $acc3
    734 	adox	$t1, $acc4
    735 
    736 	mulx	8*3+128($a_ptr), $t0, $t1
    737 	 mov	$acc1, %rdx
    738 	adcx	$t0, $acc4
    739 	 shlx	$poly1, $acc1, $t0
    740 	adox	$t1, $acc5
    741 	 shrx	$poly1, $acc1, $t1
    742 
    743 	adcx	$acc0, $acc5
    744 	adox	$acc0, $acc0
    745 	adc	\$0, $acc0
    746 
    747 	########################################################################
    748 	# Second reduction step
    749 	add	$t0, $acc2
    750 	adc	$t1, $acc3
    751 
    752 	mulx	$poly3, $t0, $t1
    753 	 mov	8*2($b_ptr), %rdx
    754 	adc	$t0, $acc4
    755 	adc	$t1, $acc5
    756 	adc	\$0, $acc0
    757 	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
    758 
    759 	########################################################################
    760 	# Multiply by b[2]
    761 	mulx	8*0+128($a_ptr), $t0, $t1
    762 	adcx	$t0, $acc2
    763 	adox	$t1, $acc3
    764 
    765 	mulx	8*1+128($a_ptr), $t0, $t1
    766 	adcx	$t0, $acc3
    767 	adox	$t1, $acc4
    768 
    769 	mulx	8*2+128($a_ptr), $t0, $t1
    770 	adcx	$t0, $acc4
    771 	adox	$t1, $acc5
    772 
    773 	mulx	8*3+128($a_ptr), $t0, $t1
    774 	 mov	$acc2, %rdx
    775 	adcx	$t0, $acc5
    776 	 shlx	$poly1, $acc2, $t0
    777 	adox	$t1, $acc0
    778 	 shrx	$poly1, $acc2, $t1
    779 
    780 	adcx	$acc1, $acc0
    781 	adox	$acc1, $acc1
    782 	adc	\$0, $acc1
    783 
    784 	########################################################################
    785 	# Third reduction step
    786 	add	$t0, $acc3
    787 	adc	$t1, $acc4
    788 
    789 	mulx	$poly3, $t0, $t1
    790 	 mov	8*3($b_ptr), %rdx
    791 	adc	$t0, $acc5
    792 	adc	$t1, $acc0
    793 	adc	\$0, $acc1
    794 	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
    795 
    796 	########################################################################
    797 	# Multiply by b[3]
    798 	mulx	8*0+128($a_ptr), $t0, $t1
    799 	adcx	$t0, $acc3
    800 	adox	$t1, $acc4
    801 
    802 	mulx	8*1+128($a_ptr), $t0, $t1
    803 	adcx	$t0, $acc4
    804 	adox	$t1, $acc5
    805 
    806 	mulx	8*2+128($a_ptr), $t0, $t1
    807 	adcx	$t0, $acc5
    808 	adox	$t1, $acc0
    809 
    810 	mulx	8*3+128($a_ptr), $t0, $t1
    811 	 mov	$acc3, %rdx
    812 	adcx	$t0, $acc0
    813 	 shlx	$poly1, $acc3, $t0
    814 	adox	$t1, $acc1
    815 	 shrx	$poly1, $acc3, $t1
    816 
    817 	adcx	$acc2, $acc1
    818 	adox	$acc2, $acc2
    819 	adc	\$0, $acc2
    820 
    821 	########################################################################
    822 	# Fourth reduction step
    823 	add	$t0, $acc4
    824 	adc	$t1, $acc5
    825 
    826 	mulx	$poly3, $t0, $t1
    827 	 mov	$acc4, $t2
    828 	mov	.Lpoly+8*1(%rip), $poly1
    829 	adc	$t0, $acc0
    830 	 mov	$acc5, $t3
    831 	adc	$t1, $acc1
    832 	adc	\$0, $acc2
    833 
    834 	########################################################################
    835 	# Branch-less conditional subtraction of P
    836 	xor	%eax, %eax
    837 	 mov	$acc0, $t0
    838 	sbb	\$-1, $acc4		# .Lpoly[0]
    839 	sbb	$poly1, $acc5		# .Lpoly[1]
    840 	sbb	\$0, $acc0		# .Lpoly[2]
    841 	 mov	$acc1, $t1
    842 	sbb	$poly3, $acc1		# .Lpoly[3]
    843 	sbb	\$0, $acc2
    844 
    845 	cmovc	$t2, $acc4
    846 	cmovc	$t3, $acc5
    847 	mov	$acc4, 8*0($r_ptr)
    848 	cmovc	$t0, $acc0
    849 	mov	$acc5, 8*1($r_ptr)
    850 	cmovc	$t1, $acc1
    851 	mov	$acc0, 8*2($r_ptr)
    852 	mov	$acc1, 8*3($r_ptr)
    853 
    854 	ret
    855 .size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
    856 
    857 .type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
    858 .align	32
    859 __ecp_nistz256_sqr_montx:
    860 	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
    861 	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
    862 	xor	%eax, %eax
    863 	adc	$t0, $acc2
    864 	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
    865 	 mov	$acc6, %rdx
    866 	adc	$t1, $acc3
    867 	adc	\$0, $acc4
    868 	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
    869 
    870 	#################################
    871 	mulx	$acc7, $t0, $t1		# a[1]*a[2]
    872 	adcx	$t0, $acc3
    873 	adox	$t1, $acc4
    874 
    875 	mulx	$acc0, $t0, $t1		# a[1]*a[3]
    876 	 mov	$acc7, %rdx
    877 	adcx	$t0, $acc4
    878 	adox	$t1, $acc5
    879 	adc	\$0, $acc5
    880 
    881 	#################################
    882 	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
    883 	 mov	8*0+128($a_ptr), %rdx
    884 	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
    885 	 adcx	$acc1, $acc1		# acc1:6<<1
    886 	adox	$t0, $acc5
    887 	 adcx	$acc2, $acc2
    888 	adox	$acc7, $acc6		# of=0
    889 
    890 	mulx	%rdx, $acc0, $t1
    891 	mov	8*1+128($a_ptr), %rdx
    892 	 adcx	$acc3, $acc3
    893 	adox	$t1, $acc1
    894 	 adcx	$acc4, $acc4
    895 	mulx	%rdx, $t0, $t4
    896 	mov	8*2+128($a_ptr), %rdx
    897 	 adcx	$acc5, $acc5
    898 	adox	$t0, $acc2
    899 	 adcx	$acc6, $acc6
    900 	.byte	0x67
    901 	mulx	%rdx, $t0, $t1
    902 	mov	8*3+128($a_ptr), %rdx
    903 	adox	$t4, $acc3
    904 	 adcx	$acc7, $acc7
    905 	adox	$t0, $acc4
    906 	 mov	\$32, $a_ptr
    907 	adox	$t1, $acc5
    908 	.byte	0x67,0x67
    909 	mulx	%rdx, $t0, $t4
    910 	 mov	$acc0, %rdx
    911 	adox	$t0, $acc6
    912 	 shlx	$a_ptr, $acc0, $t0
    913 	adox	$t4, $acc7
    914 	 shrx	$a_ptr, $acc0, $t4
    915 	 mov	.Lpoly+8*3(%rip), $t1
    916 
    917 	# reduction step 1
    918 	add	$t0, $acc1
    919 	adc	$t4, $acc2
    920 
    921 	mulx	$t1, $t0, $acc0
    922 	 mov	$acc1, %rdx
    923 	adc	$t0, $acc3
    924 	 shlx	$a_ptr, $acc1, $t0
    925 	adc	\$0, $acc0
    926 	 shrx	$a_ptr, $acc1, $t4
    927 
    928 	# reduction step 2
    929 	add	$t0, $acc2
    930 	adc	$t4, $acc3
    931 
    932 	mulx	$t1, $t0, $acc1
    933 	 mov	$acc2, %rdx
    934 	adc	$t0, $acc0
    935 	 shlx	$a_ptr, $acc2, $t0
    936 	adc	\$0, $acc1
    937 	 shrx	$a_ptr, $acc2, $t4
    938 
    939 	# reduction step 3
    940 	add	$t0, $acc3
    941 	adc	$t4, $acc0
    942 
    943 	mulx	$t1, $t0, $acc2
    944 	 mov	$acc3, %rdx
    945 	adc	$t0, $acc1
    946 	 shlx	$a_ptr, $acc3, $t0
    947 	adc	\$0, $acc2
    948 	 shrx	$a_ptr, $acc3, $t4
    949 
    950 	# reduction step 4
    951 	add	$t0, $acc0
    952 	adc	$t4, $acc1
    953 
    954 	mulx	$t1, $t0, $acc3
    955 	adc	$t0, $acc2
    956 	adc	\$0, $acc3
    957 
    958 	xor	$t3, $t3		# cf=0
    959 	adc	$acc0, $acc4		# accumulate upper half
    960 	 mov	.Lpoly+8*1(%rip), $a_ptr
    961 	adc	$acc1, $acc5
    962 	 mov	$acc4, $acc0
    963 	adc	$acc2, $acc6
    964 	adc	$acc3, $acc7
    965 	 mov	$acc5, $acc1
    966 	adc	\$0, $t3
    967 
    968 	xor	%eax, %eax		# cf=0
    969 	sbb	\$-1, $acc4		# .Lpoly[0]
    970 	 mov	$acc6, $acc2
    971 	sbb	$a_ptr, $acc5		# .Lpoly[1]
    972 	sbb	\$0, $acc6		# .Lpoly[2]
    973 	 mov	$acc7, $acc3
    974 	sbb	$t1, $acc7		# .Lpoly[3]
    975 	sbb	\$0, $t3
    976 
    977 	cmovc	$acc0, $acc4
    978 	cmovc	$acc1, $acc5
    979 	mov	$acc4, 8*0($r_ptr)
    980 	cmovc	$acc2, $acc6
    981 	mov	$acc5, 8*1($r_ptr)
    982 	cmovc	$acc3, $acc7
    983 	mov	$acc6, 8*2($r_ptr)
    984 	mov	$acc7, 8*3($r_ptr)
    985 
    986 	ret
    987 .size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
    988 ___
    989 }
    990 }
    991 {
    992 my ($r_ptr,$in_ptr)=("%rdi","%rsi");
    993 my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
    994 my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
    995 
    996 $code.=<<___;
    997 ################################################################################
    998 # void ecp_nistz256_from_mont(
    999 #   uint64_t res[4],
   1000 #   uint64_t in[4]);
   1001 # This one performs Montgomery multiplication by 1, so we only need the reduction
   1002 
   1003 .globl	ecp_nistz256_from_mont
   1004 .type	ecp_nistz256_from_mont,\@function,2
   1005 .align	32
   1006 ecp_nistz256_from_mont:
   1007 	push	%r12
   1008 	push	%r13
   1009 
   1010 	mov	8*0($in_ptr), %rax
   1011 	mov	.Lpoly+8*3(%rip), $t2
   1012 	mov	8*1($in_ptr), $acc1
   1013 	mov	8*2($in_ptr), $acc2
   1014 	mov	8*3($in_ptr), $acc3
   1015 	mov	%rax, $acc0
   1016 	mov	.Lpoly+8*1(%rip), $t1
   1017 
   1018 	#########################################
   1019 	# First iteration
   1020 	mov	%rax, $t0
   1021 	shl	\$32, $acc0
   1022 	mulq	$t2
   1023 	shr	\$32, $t0
   1024 	add	$acc0, $acc1
   1025 	adc	$t0, $acc2
   1026 	adc	%rax, $acc3
   1027 	 mov	$acc1, %rax
   1028 	adc	\$0, %rdx
   1029 
   1030 	#########################################
   1031 	# Second iteration
   1032 	mov	$acc1, $t0
   1033 	shl	\$32, $acc1
   1034 	mov	%rdx, $acc0
   1035 	mulq	$t2
   1036 	shr	\$32, $t0
   1037 	add	$acc1, $acc2
   1038 	adc	$t0, $acc3
   1039 	adc	%rax, $acc0
   1040 	 mov	$acc2, %rax
   1041 	adc	\$0, %rdx
   1042 
   1043 	##########################################
   1044 	# Third iteration
   1045 	mov	$acc2, $t0
   1046 	shl	\$32, $acc2
   1047 	mov	%rdx, $acc1
   1048 	mulq	$t2
   1049 	shr	\$32, $t0
   1050 	add	$acc2, $acc3
   1051 	adc	$t0, $acc0
   1052 	adc	%rax, $acc1
   1053 	 mov	$acc3, %rax
   1054 	adc	\$0, %rdx
   1055 
   1056 	###########################################
   1057 	# Last iteration
   1058 	mov	$acc3, $t0
   1059 	shl	\$32, $acc3
   1060 	mov	%rdx, $acc2
   1061 	mulq	$t2
   1062 	shr	\$32, $t0
   1063 	add	$acc3, $acc0
   1064 	adc	$t0, $acc1
   1065 	 mov	$acc0, $t0
   1066 	adc	%rax, $acc2
   1067 	 mov	$acc1, $in_ptr
   1068 	adc	\$0, %rdx
   1069 
   1070 	sub	\$-1, $acc0
   1071 	 mov	$acc2, %rax
   1072 	sbb	$t1, $acc1
   1073 	sbb	\$0, $acc2
   1074 	 mov	%rdx, $acc3
   1075 	sbb	$t2, %rdx
   1076 	sbb	$t2, $t2
   1077 
   1078 	cmovnz	$t0, $acc0
   1079 	cmovnz	$in_ptr, $acc1
   1080 	mov	$acc0, 8*0($r_ptr)
   1081 	cmovnz	%rax, $acc2
   1082 	mov	$acc1, 8*1($r_ptr)
   1083 	cmovz	%rdx, $acc3
   1084 	mov	$acc2, 8*2($r_ptr)
   1085 	mov	$acc3, 8*3($r_ptr)
   1086 
   1087 	pop	%r13
   1088 	pop	%r12
   1089 	ret
   1090 .size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
   1091 ___
   1092 }
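################################################################################
# ecp_nistz256_from_mont is the multiply-by-one case of the Montgomery
# product: only the four reduction steps are needed to strip the 2^256
# factor, e.g. feeding it .LONE_mont returns 1.  A reference sketch (sub
# name ours, not used by the generator):

use Math::BigInt;

sub p256_ref_from_mont {	# x*2^-256 mod p
	my ($x) = @_;
	my $two = Math::BigInt->new(2);
	my $p   = $two**256 - $two**224 + $two**192 + $two**96 - 1;
	my $acc = $x;
	for (1 .. 4) {		# the same per-limb reduction as in mul_mont
		my $m = $acc % ($two**64);
		$acc = ($acc + $m * $p) / ($two**64);
	}
	$acc -= $p if $acc >= $p;
	return $acc;
}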
   1093 {
   1094 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
   1095 my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
   1096 my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
   1097 my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
   1098 
   1099 $code.=<<___;
   1100 ################################################################################
   1101 # void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
   1102 .globl	ecp_nistz256_select_w5
   1103 .type	ecp_nistz256_select_w5,\@abi-omnipotent
   1104 .align	32
   1105 ecp_nistz256_select_w5:
   1106 ___
   1107 $code.=<<___	if ($avx>1);
   1108 	mov	OPENSSL_ia32cap_P+8(%rip), %eax
   1109 	test	\$`1<<5`, %eax
   1110 	jnz	.Lavx2_select_w5
   1111 ___
   1112 $code.=<<___	if ($win64);
   1113 	lea	-0x88(%rsp), %rax
   1114 .LSEH_begin_ecp_nistz256_select_w5:
   1115 	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
   1116 	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
   1117 	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
   1118 	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
   1119 	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
   1120 	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
   1121 	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
   1122 	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
   1123 	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
   1124 	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
   1125 	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
   1126 ___
   1127 $code.=<<___;
   1128 	movdqa	.LOne(%rip), $ONE
   1129 	movd	$index, $INDEX
   1130 
   1131 	pxor	$Ra, $Ra
   1132 	pxor	$Rb, $Rb
   1133 	pxor	$Rc, $Rc
   1134 	pxor	$Rd, $Rd
   1135 	pxor	$Re, $Re
   1136 	pxor	$Rf, $Rf
   1137 
   1138 	movdqa	$ONE, $M0
   1139 	pshufd	\$0, $INDEX, $INDEX
   1140 
   1141 	mov	\$16, %rax
   1142 .Lselect_loop_sse_w5:
   1143 
   1144 	movdqa	$M0, $TMP0
   1145 	paddd	$ONE, $M0
   1146 	pcmpeqd $INDEX, $TMP0
   1147 
   1148 	movdqa	16*0($in_t), $T0a
   1149 	movdqa	16*1($in_t), $T0b
   1150 	movdqa	16*2($in_t), $T0c
   1151 	movdqa	16*3($in_t), $T0d
   1152 	movdqa	16*4($in_t), $T0e
   1153 	movdqa	16*5($in_t), $T0f
   1154 	lea 16*6($in_t), $in_t
   1155 
   1156 	pand	$TMP0, $T0a
   1157 	pand	$TMP0, $T0b
   1158 	por	$T0a, $Ra
   1159 	pand	$TMP0, $T0c
   1160 	por	$T0b, $Rb
   1161 	pand	$TMP0, $T0d
   1162 	por	$T0c, $Rc
   1163 	pand	$TMP0, $T0e
   1164 	por	$T0d, $Rd
   1165 	pand	$TMP0, $T0f
   1166 	por	$T0e, $Re
   1167 	por	$T0f, $Rf
   1168 
   1169 	dec	%rax
   1170 	jnz	.Lselect_loop_sse_w5
   1171 
   1172 	movdqu	$Ra, 16*0($val)
   1173 	movdqu	$Rb, 16*1($val)
   1174 	movdqu	$Rc, 16*2($val)
   1175 	movdqu	$Rd, 16*3($val)
   1176 	movdqu	$Re, 16*4($val)
   1177 	movdqu	$Rf, 16*5($val)
   1178 ___
   1179 $code.=<<___	if ($win64);
   1180 	movaps	(%rsp), %xmm6
   1181 	movaps	0x10(%rsp), %xmm7
   1182 	movaps	0x20(%rsp), %xmm8
   1183 	movaps	0x30(%rsp), %xmm9
   1184 	movaps	0x40(%rsp), %xmm10
   1185 	movaps	0x50(%rsp), %xmm11
   1186 	movaps	0x60(%rsp), %xmm12
   1187 	movaps	0x70(%rsp), %xmm13
   1188 	movaps	0x80(%rsp), %xmm14
   1189 	movaps	0x90(%rsp), %xmm15
   1190 	lea	0xa8(%rsp), %rsp
   1191 .LSEH_end_ecp_nistz256_select_w5:
   1192 ___
   1193 $code.=<<___;
   1194 	ret
   1195 .size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
   1196 
   1197 ################################################################################
   1198 # void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
   1199 .globl	ecp_nistz256_select_w7
   1200 .type	ecp_nistz256_select_w7,\@abi-omnipotent
   1201 .align	32
   1202 ecp_nistz256_select_w7:
   1203 ___
   1204 $code.=<<___	if ($avx>1);
   1205 	mov	OPENSSL_ia32cap_P+8(%rip), %eax
   1206 	test	\$`1<<5`, %eax
   1207 	jnz	.Lavx2_select_w7
   1208 ___
   1209 $code.=<<___	if ($win64);
   1210 	lea	-0x88(%rsp), %rax
   1211 .LSEH_begin_ecp_nistz256_select_w7:
   1212 	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
   1213 	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
   1214 	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
   1215 	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
   1216 	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
   1217 	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
   1218 	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
   1219 	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
   1220 	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
   1221 	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
   1222 	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
   1223 ___
   1224 $code.=<<___;
   1225 	movdqa	.LOne(%rip), $M0
   1226 	movd	$index, $INDEX
   1227 
   1228 	pxor	$Ra, $Ra
   1229 	pxor	$Rb, $Rb
   1230 	pxor	$Rc, $Rc
   1231 	pxor	$Rd, $Rd
   1232 
   1233 	movdqa	$M0, $ONE
   1234 	pshufd	\$0, $INDEX, $INDEX
   1235 	mov	\$64, %rax
   1236 
   1237 .Lselect_loop_sse_w7:
   1238 	movdqa	$M0, $TMP0
   1239 	paddd	$ONE, $M0
   1240 	movdqa	16*0($in_t), $T0a
   1241 	movdqa	16*1($in_t), $T0b
   1242 	pcmpeqd	$INDEX, $TMP0
   1243 	movdqa	16*2($in_t), $T0c
   1244 	movdqa	16*3($in_t), $T0d
   1245 	lea	16*4($in_t), $in_t
   1246 
   1247 	pand	$TMP0, $T0a
   1248 	pand	$TMP0, $T0b
   1249 	por	$T0a, $Ra
   1250 	pand	$TMP0, $T0c
   1251 	por	$T0b, $Rb
   1252 	pand	$TMP0, $T0d
   1253 	por	$T0c, $Rc
   1254 	prefetcht0	255($in_t)
   1255 	por	$T0d, $Rd
   1256 
   1257 	dec	%rax
   1258 	jnz	.Lselect_loop_sse_w7
   1259 
   1260 	movdqu	$Ra, 16*0($val)
   1261 	movdqu	$Rb, 16*1($val)
   1262 	movdqu	$Rc, 16*2($val)
   1263 	movdqu	$Rd, 16*3($val)
   1264 ___
   1265 $code.=<<___	if ($win64);
   1266 	movaps	(%rsp), %xmm6
   1267 	movaps	0x10(%rsp), %xmm7
   1268 	movaps	0x20(%rsp), %xmm8
   1269 	movaps	0x30(%rsp), %xmm9
   1270 	movaps	0x40(%rsp), %xmm10
   1271 	movaps	0x50(%rsp), %xmm11
   1272 	movaps	0x60(%rsp), %xmm12
   1273 	movaps	0x70(%rsp), %xmm13
   1274 	movaps	0x80(%rsp), %xmm14
   1275 	movaps	0x90(%rsp), %xmm15
   1276 	lea	0xa8(%rsp), %rsp
   1277 .LSEH_end_ecp_nistz256_select_w7:
   1278 ___
   1279 $code.=<<___;
   1280 	ret
   1281 .size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
   1282 ___
   1283 }
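################################################################################
# The select_w5/select_w7 routines above scan the whole table and
# accumulate the entry whose (1-based) position equals the index, using
# pcmpeqd/pand/por masks so that neither the memory access pattern nor
# the branch pattern depends on the secret index; index 0 selects nothing
# and yields an all-zero result.  The scalar sketch below mirrors only the
# selection logic, not the constant-time property (sub name ours; limbs
# are assumed to be plain 64-bit Perl integers):

sub p256_ref_select {
	my ($table, $index) = @_;	# $table: ref to array of entries, each a ref to an array of limbs
	my @r = (0) x scalar(@{$table->[0]});
	for my $i (1 .. scalar(@$table)) {
		my $mask = ($i == $index) ? ~0 : 0;	# all-ones or all-zeros selector
		$r[$_] |= $table->[$i - 1][$_] & $mask for 0 .. $#r;
	}
	return \@r;
}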
   1284 if ($avx>1) {
   1285 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
   1286 my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
   1287 my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
   1288 my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
   1289 
   1290 $code.=<<___;
   1291 ################################################################################
   1292 # void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
   1293 .type	ecp_nistz256_avx2_select_w5,\@abi-omnipotent
   1294 .align	32
   1295 ecp_nistz256_avx2_select_w5:
   1296 .Lavx2_select_w5:
   1297 	vzeroupper
   1298 ___
   1299 $code.=<<___	if ($win64);
   1300 	lea	-0x88(%rsp), %rax
   1301 .LSEH_begin_ecp_nistz256_avx2_select_w5:
   1302 	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
   1303 	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
   1304 	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
   1306 	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
   1307 	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
   1308 	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
   1309 	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
   1310 	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
   1311 	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
   1312 	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
   1313 ___
   1314 $code.=<<___;
   1315 	vmovdqa	.LTwo(%rip), $TWO
   1316 
   1317 	vpxor	$Ra, $Ra, $Ra
   1318 	vpxor	$Rb, $Rb, $Rb
   1319 	vpxor	$Rc, $Rc, $Rc
   1320 
   1321 	vmovdqa .LOne(%rip), $M0
   1322 	vmovdqa .LTwo(%rip), $M1
   1323 
   1324 	vmovd	$index, %xmm1
   1325 	vpermd	$INDEX, $Ra, $INDEX
   1326 
   1327 	mov	\$8, %rax
   1328 .Lselect_loop_avx2_w5:
   1329 
   1330 	vmovdqa	32*0($in_t), $T0a
   1331 	vmovdqa	32*1($in_t), $T0b
   1332 	vmovdqa	32*2($in_t), $T0c
   1333 
   1334 	vmovdqa	32*3($in_t), $T1a
   1335 	vmovdqa	32*4($in_t), $T1b
   1336 	vmovdqa	32*5($in_t), $T1c
   1337 
   1338 	vpcmpeqd	$INDEX, $M0, $TMP0
   1339 	vpcmpeqd	$INDEX, $M1, $TMP1
   1340 
   1341 	vpaddd	$TWO, $M0, $M0
   1342 	vpaddd	$TWO, $M1, $M1
   1343 	lea	32*6($in_t), $in_t
   1344 
   1345 	vpand	$TMP0, $T0a, $T0a
   1346 	vpand	$TMP0, $T0b, $T0b
   1347 	vpand	$TMP0, $T0c, $T0c
   1348 	vpand	$TMP1, $T1a, $T1a
   1349 	vpand	$TMP1, $T1b, $T1b
   1350 	vpand	$TMP1, $T1c, $T1c
   1351 
   1352 	vpxor	$T0a, $Ra, $Ra
   1353 	vpxor	$T0b, $Rb, $Rb
   1354 	vpxor	$T0c, $Rc, $Rc
   1355 	vpxor	$T1a, $Ra, $Ra
   1356 	vpxor	$T1b, $Rb, $Rb
   1357 	vpxor	$T1c, $Rc, $Rc
   1358 
   1359 	dec %rax
   1360 	jnz .Lselect_loop_avx2_w5
   1361 
   1362 	vmovdqu $Ra, 32*0($val)
   1363 	vmovdqu $Rb, 32*1($val)
   1364 	vmovdqu $Rc, 32*2($val)
   1365 	vzeroupper
   1366 ___
   1367 $code.=<<___	if ($win64);
   1368 	movaps	(%rsp), %xmm6
   1369 	movaps	0x10(%rsp), %xmm7
   1370 	movaps	0x20(%rsp), %xmm8
   1371 	movaps	0x30(%rsp), %xmm9
   1372 	movaps	0x40(%rsp), %xmm10
   1373 	movaps	0x50(%rsp), %xmm11
   1374 	movaps	0x60(%rsp), %xmm12
   1375 	movaps	0x70(%rsp), %xmm13
   1376 	movaps	0x80(%rsp), %xmm14
   1377 	movaps	0x90(%rsp), %xmm15
   1378 	lea	0xa8(%rsp), %rsp
   1379 .LSEH_end_ecp_nistz256_avx2_select_w5:
   1380 ___
   1381 $code.=<<___;
   1382 	ret
   1383 .size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
   1384 ___
   1385 }
   1386 if ($avx>1) {
   1387 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
   1388 my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
   1389 my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
   1390 my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
   1391 my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
   1392 
   1393 $code.=<<___;
   1394 
   1395 ################################################################################
   1396 # void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
   1397 .globl	ecp_nistz256_avx2_select_w7
   1398 .type	ecp_nistz256_avx2_select_w7,\@abi-omnipotent
   1399 .align	32
   1400 ecp_nistz256_avx2_select_w7:
   1401 .Lavx2_select_w7:
   1402 	vzeroupper
   1403 ___
   1404 $code.=<<___	if ($win64);
   1405 	lea	-0x88(%rsp), %rax
   1406 .LSEH_begin_ecp_nistz256_avx2_select_w7:
   1407 	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
   1408 	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
   1409 	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
   1411 	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
   1412 	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
   1413 	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
   1414 	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
   1415 	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
   1416 	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
   1417 	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
   1418 ___
   1419 $code.=<<___;
   1420 	vmovdqa	.LThree(%rip), $THREE
   1421 
   1422 	vpxor	$Ra, $Ra, $Ra
   1423 	vpxor	$Rb, $Rb, $Rb
   1424 
   1425 	vmovdqa .LOne(%rip), $M0
   1426 	vmovdqa .LTwo(%rip), $M1
   1427 	vmovdqa .LThree(%rip), $M2
   1428 
   1429 	vmovd	$index, %xmm1
   1430 	vpermd	$INDEX, $Ra, $INDEX
   1431 	# Skip index = 0, because it is implicitly the point at infinity
   1432 
   1433 	mov	\$21, %rax
   1434 .Lselect_loop_avx2_w7:
   1435 
   1436 	vmovdqa	32*0($in_t), $T0a
   1437 	vmovdqa	32*1($in_t), $T0b
   1438 
   1439 	vmovdqa	32*2($in_t), $T1a
   1440 	vmovdqa	32*3($in_t), $T1b
   1441 
   1442 	vmovdqa	32*4($in_t), $T2a
   1443 	vmovdqa	32*5($in_t), $T2b
   1444 
   1445 	vpcmpeqd	$INDEX, $M0, $TMP0
   1446 	vpcmpeqd	$INDEX, $M1, $TMP1
   1447 	vpcmpeqd	$INDEX, $M2, $TMP2
   1448 
   1449 	vpaddd	$THREE, $M0, $M0
   1450 	vpaddd	$THREE, $M1, $M1
   1451 	vpaddd	$THREE, $M2, $M2
   1452 	lea	32*6($in_t), $in_t
   1453 
   1454 	vpand	$TMP0, $T0a, $T0a
   1455 	vpand	$TMP0, $T0b, $T0b
   1456 	vpand	$TMP1, $T1a, $T1a
   1457 	vpand	$TMP1, $T1b, $T1b
   1458 	vpand	$TMP2, $T2a, $T2a
   1459 	vpand	$TMP2, $T2b, $T2b
   1460 
   1461 	vpxor	$T0a, $Ra, $Ra
   1462 	vpxor	$T0b, $Rb, $Rb
   1463 	vpxor	$T1a, $Ra, $Ra
   1464 	vpxor	$T1b, $Rb, $Rb
   1465 	vpxor	$T2a, $Ra, $Ra
   1466 	vpxor	$T2b, $Rb, $Rb
   1467 
   1468 	dec %rax
   1469 	jnz .Lselect_loop_avx2_w7
   1470 
   1471 
   1472 	vmovdqa	32*0($in_t), $T0a
   1473 	vmovdqa	32*1($in_t), $T0b
   1474 
   1475 	vpcmpeqd	$INDEX, $M0, $TMP0
   1476 
   1477 	vpand	$TMP0, $T0a, $T0a
   1478 	vpand	$TMP0, $T0b, $T0b
   1479 
   1480 	vpxor	$T0a, $Ra, $Ra
   1481 	vpxor	$T0b, $Rb, $Rb
   1482 
   1483 	vmovdqu $Ra, 32*0($val)
   1484 	vmovdqu $Rb, 32*1($val)
   1485 	vzeroupper
   1486 ___
   1487 $code.=<<___	if ($win64);
   1488 	movaps	(%rsp), %xmm6
   1489 	movaps	0x10(%rsp), %xmm7
   1490 	movaps	0x20(%rsp), %xmm8
   1491 	movaps	0x30(%rsp), %xmm9
   1492 	movaps	0x40(%rsp), %xmm10
   1493 	movaps	0x50(%rsp), %xmm11
   1494 	movaps	0x60(%rsp), %xmm12
   1495 	movaps	0x70(%rsp), %xmm13
   1496 	movaps	0x80(%rsp), %xmm14
   1497 	movaps	0x90(%rsp), %xmm15
   1498 	lea	0xa8(%rsp), %rsp
   1499 .LSEH_end_ecp_nistz256_avx2_select_w7:
   1500 ___
   1501 $code.=<<___;
   1502 	ret
   1503 .size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
   1504 ___
   1505 } else {
   1506 $code.=<<___;
   1507 .globl	ecp_nistz256_avx2_select_w7
   1508 .type	ecp_nistz256_avx2_select_w7,\@function,3
   1509 .align	32
   1510 ecp_nistz256_avx2_select_w7:
   1511 	.byte	0x0f,0x0b	# ud2
   1512 	ret
   1513 .size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
   1514 ___
   1515 }
   1516 {{{
   1517 ########################################################################
# This block implements the higher-level point_double, point_add and
# point_add_affine routines. The key to performance here is to let the
# out-of-order execution logic overlap computations from the next step
# with tail processing from the current step. By using a tailored calling
# sequence we minimize inter-step overhead and give the processor a better
# shot at overlapping operations...
#
# You will notice that input data is copied to the stack. The trouble is
# that there are no registers to spare for holding the original pointers,
# and reloading them would create undesired dependencies on the
# effective-address calculation paths. In other words, it is all done
# to favour the out-of-order execution logic.
#						<appro@openssl.org>
   1531 
   1532 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
   1533 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
   1534 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
   1535 my ($poly1,$poly3)=($acc6,$acc7);
   1536 
   1537 sub load_for_mul () {
   1538 my ($a,$b,$src0) = @_;
   1539 my $bias = $src0 eq "%rax" ? 0 : -128;
   1540 
   1541 "	mov	$b, $src0
   1542 	lea	$b, $b_ptr
   1543 	mov	8*0+$a, $acc1
   1544 	mov	8*1+$a, $acc2
   1545 	lea	$bias+$a, $a_ptr
   1546 	mov	8*2+$a, $acc3
   1547 	mov	8*3+$a, $acc4"
   1548 }
   1549 
   1550 sub load_for_sqr () {
   1551 my ($a,$src0) = @_;
   1552 my $bias = $src0 eq "%rax" ? 0 : -128;
   1553 
   1554 "	mov	8*0+$a, $src0
   1555 	mov	8*1+$a, $acc6
   1556 	lea	$bias+$a, $a_ptr
   1557 	mov	8*2+$a, $acc7
   1558 	mov	8*3+$a, $acc0"
   1559 }
   1560 
   1561 									{
   1562 ########################################################################
   1563 # operate in 4-5-0-1 "name space" that matches multiplication output
   1564 #
   1565 my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
   1566 
   1567 $code.=<<___;
   1568 .type	__ecp_nistz256_add_toq,\@abi-omnipotent
   1569 .align	32
   1570 __ecp_nistz256_add_toq:
   1571 	add	8*0($b_ptr), $a0
   1572 	adc	8*1($b_ptr), $a1
   1573 	 mov	$a0, $t0
   1574 	adc	8*2($b_ptr), $a2
   1575 	adc	8*3($b_ptr), $a3
   1576 	 mov	$a1, $t1
   1577 	sbb	$t4, $t4
   1578 
   1579 	sub	\$-1, $a0
   1580 	 mov	$a2, $t2
   1581 	sbb	$poly1, $a1
   1582 	sbb	\$0, $a2
   1583 	 mov	$a3, $t3
   1584 	sbb	$poly3, $a3
   1585 	test	$t4, $t4
   1586 
   1587 	cmovz	$t0, $a0
   1588 	cmovz	$t1, $a1
   1589 	mov	$a0, 8*0($r_ptr)
   1590 	cmovz	$t2, $a2
   1591 	mov	$a1, 8*1($r_ptr)
   1592 	cmovz	$t3, $a3
   1593 	mov	$a2, 8*2($r_ptr)
   1594 	mov	$a3, 8*3($r_ptr)
   1595 
   1596 	ret
   1597 .size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
   1598 
   1599 .type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
   1600 .align	32
   1601 __ecp_nistz256_sub_fromq:
   1602 	sub	8*0($b_ptr), $a0
   1603 	sbb	8*1($b_ptr), $a1
   1604 	 mov	$a0, $t0
   1605 	sbb	8*2($b_ptr), $a2
   1606 	sbb	8*3($b_ptr), $a3
   1607 	 mov	$a1, $t1
   1608 	sbb	$t4, $t4
   1609 
   1610 	add	\$-1, $a0
   1611 	 mov	$a2, $t2
   1612 	adc	$poly1, $a1
   1613 	adc	\$0, $a2
   1614 	 mov	$a3, $t3
   1615 	adc	$poly3, $a3
   1616 	test	$t4, $t4
   1617 
   1618 	cmovz	$t0, $a0
   1619 	cmovz	$t1, $a1
   1620 	mov	$a0, 8*0($r_ptr)
   1621 	cmovz	$t2, $a2
   1622 	mov	$a1, 8*1($r_ptr)
   1623 	cmovz	$t3, $a3
   1624 	mov	$a2, 8*2($r_ptr)
   1625 	mov	$a3, 8*3($r_ptr)
   1626 
   1627 	ret
   1628 .size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
   1629 
   1630 .type	__ecp_nistz256_subq,\@abi-omnipotent
   1631 .align	32
   1632 __ecp_nistz256_subq:
   1633 	sub	$a0, $t0
   1634 	sbb	$a1, $t1
   1635 	 mov	$t0, $a0
   1636 	sbb	$a2, $t2
   1637 	sbb	$a3, $t3
   1638 	 mov	$t1, $a1
   1639 	sbb	$t4, $t4
   1640 
   1641 	add	\$-1, $t0
   1642 	 mov	$t2, $a2
   1643 	adc	$poly1, $t1
   1644 	adc	\$0, $t2
   1645 	 mov	$t3, $a3
   1646 	adc	$poly3, $t3
   1647 	test	$t4, $t4
   1648 
   1649 	cmovnz	$t0, $a0
   1650 	cmovnz	$t1, $a1
   1651 	cmovnz	$t2, $a2
   1652 	cmovnz	$t3, $a3
   1653 
   1654 	ret
   1655 .size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
   1656 
   1657 .type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
   1658 .align	32
   1659 __ecp_nistz256_mul_by_2q:
   1660 	add	$a0, $a0		# a0:a3+a0:a3
   1661 	adc	$a1, $a1
   1662 	 mov	$a0, $t0
   1663 	adc	$a2, $a2
   1664 	adc	$a3, $a3
   1665 	 mov	$a1, $t1
   1666 	sbb	$t4, $t4
   1667 
   1668 	sub	\$-1, $a0
   1669 	 mov	$a2, $t2
   1670 	sbb	$poly1, $a1
   1671 	sbb	\$0, $a2
   1672 	 mov	$a3, $t3
   1673 	sbb	$poly3, $a3
   1674 	test	$t4, $t4
   1675 
   1676 	cmovz	$t0, $a0
   1677 	cmovz	$t1, $a1
   1678 	mov	$a0, 8*0($r_ptr)
   1679 	cmovz	$t2, $a2
   1680 	mov	$a1, 8*1($r_ptr)
   1681 	cmovz	$t3, $a3
   1682 	mov	$a2, 8*2($r_ptr)
   1683 	mov	$a3, 8*3($r_ptr)
   1684 
   1685 	ret
   1686 .size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
   1687 ___
   1688 									}
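################################################################################
# The helpers above are the field primitives used by the point routines:
# add, subtract and double modulo p, each finishing with one conditional
# correction by p selected with cmov rather than a branch (mul_by_2q
# matches the p256_ref_mul_by_2 sketch given earlier).  Big-integer
# sketches of the intended add/sub results, for reference only (sub names
# ours, not used by the generator):

use Math::BigInt;

sub p256_ref_add {		# (a + b) mod p
	my ($a, $b) = @_;
	my $two = Math::BigInt->new(2);
	my $p   = $two**256 - $two**224 + $two**192 + $two**96 - 1;
	my $r = $a + $b;
	$r -= $p if $r >= $p;	# conditional subtraction of p
	return $r;
}

sub p256_ref_sub {		# (a - b) mod p
	my ($a, $b) = @_;
	my $two = Math::BigInt->new(2);
	my $p   = $two**256 - $two**224 + $two**192 + $two**96 - 1;
	my $r = $a - $b;
	$r += $p if $r < 0;	# conditional add-back of p on borrow
	return $r;
}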
   1689 sub gen_double () {
   1690     my $x = shift;
   1691     my ($src0,$sfx,$bias);
   1692     my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
   1693 
   1694     if ($x ne "x") {
   1695 	$src0 = "%rax";
   1696 	$sfx  = "";
   1697 	$bias = 0;
   1698 
   1699 $code.=<<___;
   1700 .globl	ecp_nistz256_point_double
   1701 .type	ecp_nistz256_point_double,\@function,2
   1702 .align	32
   1703 ecp_nistz256_point_double:
   1704 ___
   1705 $code.=<<___	if ($addx);
   1706 	mov	\$0x80100, %ecx
   1707 	and	OPENSSL_ia32cap_P+8(%rip), %ecx
   1708 	cmp	\$0x80100, %ecx
   1709 	je	.Lpoint_doublex
   1710 ___
   1711     } else {
   1712 	$src0 = "%rdx";
   1713 	$sfx  = "x";
   1714 	$bias = 128;
   1715 
   1716 $code.=<<___;
   1717 .type	ecp_nistz256_point_doublex,\@function,2
   1718 .align	32
   1719 ecp_nistz256_point_doublex:
   1720 .Lpoint_doublex:
   1721 ___
   1722     }
   1723 $code.=<<___;
   1724 	push	%rbp
   1725 	push	%rbx
   1726 	push	%r12
   1727 	push	%r13
   1728 	push	%r14
   1729 	push	%r15
   1730 	sub	\$32*5+8, %rsp
   1731 
   1732 	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
   1733 	mov	$a_ptr, $b_ptr			# backup copy
   1734 	movdqu	0x10($a_ptr), %xmm1
   1735 	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
   1736 	 mov	0x20+8*1($a_ptr), $acc5
   1737 	 mov	0x20+8*2($a_ptr), $acc0
   1738 	 mov	0x20+8*3($a_ptr), $acc1
   1739 	 mov	.Lpoly+8*1(%rip), $poly1
   1740 	 mov	.Lpoly+8*3(%rip), $poly3
   1741 	movdqa	%xmm0, $in_x(%rsp)
   1742 	movdqa	%xmm1, $in_x+0x10(%rsp)
   1743 	lea	0x20($r_ptr), $acc2
   1744 	lea	0x40($r_ptr), $acc3
   1745 	movq	$r_ptr, %xmm0
   1746 	movq	$acc2, %xmm1
   1747 	movq	$acc3, %xmm2
   1748 
   1749 	lea	$S(%rsp), $r_ptr
   1750 	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
   1751 
   1752 	mov	0x40+8*0($a_ptr), $src0
   1753 	mov	0x40+8*1($a_ptr), $acc6
   1754 	mov	0x40+8*2($a_ptr), $acc7
   1755 	mov	0x40+8*3($a_ptr), $acc0
   1756 	lea	0x40-$bias($a_ptr), $a_ptr
   1757 	lea	$Zsqr(%rsp), $r_ptr
   1758 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
   1759 
   1760 	`&load_for_sqr("$S(%rsp)", "$src0")`
   1761 	lea	$S(%rsp), $r_ptr
   1762 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
   1763 
   1764 	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
   1765 	mov	0x40+8*0($b_ptr), $acc1
   1766 	mov	0x40+8*1($b_ptr), $acc2
   1767 	mov	0x40+8*2($b_ptr), $acc3
   1768 	mov	0x40+8*3($b_ptr), $acc4
   1769 	lea	0x40-$bias($b_ptr), $a_ptr
   1770 	lea	0x20($b_ptr), $b_ptr
   1771 	movq	%xmm2, $r_ptr
   1772 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
   1773 	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
   1774 
   1775 	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
   1776 	mov	$in_x+8*1(%rsp), $acc5
   1777 	lea	$Zsqr(%rsp), $b_ptr
   1778 	mov	$in_x+8*2(%rsp), $acc0
   1779 	mov	$in_x+8*3(%rsp), $acc1
   1780 	lea	$M(%rsp), $r_ptr
   1781 	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
   1782 
   1783 	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
   1784 	mov	$in_x+8*1(%rsp), $acc5
   1785 	lea	$Zsqr(%rsp), $b_ptr
   1786 	mov	$in_x+8*2(%rsp), $acc0
   1787 	mov	$in_x+8*3(%rsp), $acc1
   1788 	lea	$Zsqr(%rsp), $r_ptr
   1789 	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
   1790 
   1791 	`&load_for_sqr("$S(%rsp)", "$src0")`
   1792 	movq	%xmm1, $r_ptr
   1793 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
   1794 ___
   1795 {
   1796 ######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
   1797 # operate in 4-5-6-7 "name space" that matches squaring output
   1798 #
   1799 my ($poly1,$poly3)=($a_ptr,$t1);
   1800 my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
   1801 
   1802 $code.=<<___;
   1803 	xor	$t4, $t4
   1804 	mov	$a0, $t0
   1805 	add	\$-1, $a0
   1806 	mov	$a1, $t1
   1807 	adc	$poly1, $a1
   1808 	mov	$a2, $t2
   1809 	adc	\$0, $a2
   1810 	mov	$a3, $t3
   1811 	adc	$poly3, $a3
   1812 	adc	\$0, $t4
   1813 	xor	$a_ptr, $a_ptr		# borrow $a_ptr
   1814 	test	\$1, $t0
   1815 
   1816 	cmovz	$t0, $a0
   1817 	cmovz	$t1, $a1
   1818 	cmovz	$t2, $a2
   1819 	cmovz	$t3, $a3
   1820 	cmovz	$a_ptr, $t4
   1821 
   1822 	mov	$a1, $t0		# a0:a3>>1
   1823 	shr	\$1, $a0
   1824 	shl	\$63, $t0
   1825 	mov	$a2, $t1
   1826 	shr	\$1, $a1
   1827 	or	$t0, $a0
   1828 	shl	\$63, $t1
   1829 	mov	$a3, $t2
   1830 	shr	\$1, $a2
   1831 	or	$t1, $a1
   1832 	shl	\$63, $t2
   1833 	mov	$a0, 8*0($r_ptr)
   1834 	shr	\$1, $a3
   1835 	mov	$a1, 8*1($r_ptr)
   1836 	shl	\$63, $t4
   1837 	or	$t2, $a2
   1838 	or	$t4, $a3
   1839 	mov	$a2, 8*2($r_ptr)
   1840 	mov	$a3, 8*3($r_ptr)
   1841 ___
   1842 }
   1843 $code.=<<___;
   1844 	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
   1845 	lea	$M(%rsp), $r_ptr
   1846 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
   1847 
   1848 	lea	$tmp0(%rsp), $r_ptr
   1849 	call	__ecp_nistz256_mul_by_2$x
   1850 
   1851 	lea	$M(%rsp), $b_ptr
   1852 	lea	$M(%rsp), $r_ptr
   1853 	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
   1854 
   1855 	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
   1856 	lea	$S(%rsp), $r_ptr
   1857 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
   1858 
   1859 	lea	$tmp0(%rsp), $r_ptr
   1860 	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
   1861 
   1862 	`&load_for_sqr("$M(%rsp)", "$src0")`
   1863 	movq	%xmm0, $r_ptr
   1864 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
   1865 
   1866 	lea	$tmp0(%rsp), $b_ptr
   1867 	mov	$acc6, $acc0			# harmonize sqr output and sub input
   1868 	mov	$acc7, $acc1
   1869 	mov	$a_ptr, $poly1
   1870 	mov	$t1, $poly3
   1871 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
   1872 
   1873 	mov	$S+8*0(%rsp), $t0
   1874 	mov	$S+8*1(%rsp), $t1
   1875 	mov	$S+8*2(%rsp), $t2
   1876 	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
   1877 	lea	$S(%rsp), $r_ptr
   1878 	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
   1879 
   1880 	mov	$M(%rsp), $src0
   1881 	lea	$M(%rsp), $b_ptr
   1882 	mov	$acc4, $acc6			# harmonize sub output and mul input
   1883 	xor	%ecx, %ecx
   1884 	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
   1885 	mov	$acc5, $acc2
   1886 	mov	$acc5, $S+8*1(%rsp)
   1887 	cmovz	$acc0, $acc3
   1888 	mov	$acc0, $S+8*2(%rsp)
   1889 	lea	$S-$bias(%rsp), $a_ptr
   1890 	cmovz	$acc1, $acc4
   1891 	mov	$acc1, $S+8*3(%rsp)
   1892 	mov	$acc6, $acc1
   1893 	lea	$S(%rsp), $r_ptr
   1894 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
   1895 
   1896 	movq	%xmm1, $b_ptr
   1897 	movq	%xmm1, $r_ptr
   1898 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
   1899 
   1900 	add	\$32*5+8, %rsp
   1901 	pop	%r15
   1902 	pop	%r14
   1903 	pop	%r13
   1904 	pop	%r12
   1905 	pop	%rbx
   1906 	pop	%rbp
   1907 	ret
   1908 .size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
   1909 ___
   1910 }
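################################################################################
# gen_double above emits the Jacobian doubling sequence annotated in the
# p256_* comments; for an a = -3 curve such as P-256 it amounts to the
# textbook formulas below (S = 4*x*y^2, M = 3*(x - z^2)*(x + z^2)).  The
# sketch works on plain integers mod p; the generated code performs the
# same arithmetic on Montgomery-form values, which leaves the formulas
# unchanged.  Sub name is ours and the sketch is not used by the generator.

use Math::BigInt;

sub p256_ref_point_double {	# Jacobian (X,Y,Z) doubling mod p
	my ($x, $y, $z) = @_;
	my $two = Math::BigInt->new(2);
	my $p   = $two**256 - $two**224 + $two**192 + $two**96 - 1;
	my $zsqr  = $z * $z % $p;				# Zsqr = in_z^2
	my $s     = 4 * $x * $y * $y % $p;			# S = 4*x*y^2
	my $m     = 3 * ($x - $zsqr) * ($x + $zsqr) % $p;	# M = 3*(x - z^2)*(x + z^2)
	my $res_z = 2 * $y * $z % $p;				# res_z = 2*y*z
	my $res_x = ($m * $m - 2 * $s) % $p;			# res_x = M^2 - 2*S
	my $res_y = ($m * ($s - $res_x) - 8 * $y**4) % $p;	# res_y = M*(S - res_x) - 8*y^4
	return ($res_x, $res_y, $res_z);
}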
   1911 &gen_double("q");
   1912 
   1913 sub gen_add () {
   1914     my $x = shift;
   1915     my ($src0,$sfx,$bias);
   1916     my ($H,$Hsqr,$R,$Rsqr,$Hcub,
   1917 	$U1,$U2,$S1,$S2,
   1918 	$res_x,$res_y,$res_z,
   1919 	$in1_x,$in1_y,$in1_z,
   1920 	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
   1921     my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
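    # The schedule below follows the usual Jacobian point-addition formulas:
    #   U1 = X1*Z2^2,  U2 = X2*Z1^2,  S1 = Y1*Z2^3,  S2 = Y2*Z1^3
    #   H  = U2 - U1,  R  = S2 - S1
    #   X3 = R^2 - H^3 - 2*U1*H^2
    #   Y3 = R*(U1*H^2 - X3) - S1*H^3
    #   Z3 = H*Z1*Z2
    # with inputs at infinity patched up by conditional moves at the end.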
   1922 
   1923     if ($x ne "x") {
   1924 	$src0 = "%rax";
   1925 	$sfx  = "";
   1926 	$bias = 0;
   1927 
   1928 $code.=<<___;
   1929 .globl	ecp_nistz256_point_add
   1930 .type	ecp_nistz256_point_add,\@function,3
   1931 .align	32
   1932 ecp_nistz256_point_add:
   1933 ___
   1934 $code.=<<___	if ($addx);
   1935 	mov	\$0x80100, %ecx
   1936 	and	OPENSSL_ia32cap_P+8(%rip), %ecx
   1937 	cmp	\$0x80100, %ecx
   1938 	je	.Lpoint_addx
   1939 ___
   1940     } else {
   1941 	$src0 = "%rdx";
   1942 	$sfx  = "x";
   1943 	$bias = 128;
   1944 
   1945 $code.=<<___;
   1946 .type	ecp_nistz256_point_addx,\@function,3
   1947 .align	32
   1948 ecp_nistz256_point_addx:
   1949 .Lpoint_addx:
   1950 ___
   1951     }
   1952 $code.=<<___;
   1953 	push	%rbp
   1954 	push	%rbx
   1955 	push	%r12
   1956 	push	%r13
   1957 	push	%r14
   1958 	push	%r15
   1959 	sub	\$32*18+8, %rsp
   1960 
   1961 	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
   1962 	movdqu	0x10($a_ptr), %xmm1
   1963 	movdqu	0x20($a_ptr), %xmm2
   1964 	movdqu	0x30($a_ptr), %xmm3
   1965 	movdqu	0x40($a_ptr), %xmm4
   1966 	movdqu	0x50($a_ptr), %xmm5
   1967 	mov	$a_ptr, $b_ptr			# reassign
   1968 	mov	$b_org, $a_ptr			# reassign
   1969 	movdqa	%xmm0, $in1_x(%rsp)
   1970 	movdqa	%xmm1, $in1_x+0x10(%rsp)
   1971 	por	%xmm0, %xmm1
   1972 	movdqa	%xmm2, $in1_y(%rsp)
   1973 	movdqa	%xmm3, $in1_y+0x10(%rsp)
   1974 	por	%xmm2, %xmm3
   1975 	movdqa	%xmm4, $in1_z(%rsp)
   1976 	movdqa	%xmm5, $in1_z+0x10(%rsp)
   1977 	por	%xmm1, %xmm3
   1978 
   1979 	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
   1980 	 pshufd	\$0xb1, %xmm3, %xmm5
   1981 	movdqu	0x10($a_ptr), %xmm1
   1982 	movdqu	0x20($a_ptr), %xmm2
   1983 	 por	%xmm3, %xmm5
   1984 	movdqu	0x30($a_ptr), %xmm3
   1985 	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
   1986 	 mov	0x40+8*1($a_ptr), $acc6
   1987 	 mov	0x40+8*2($a_ptr), $acc7
   1988 	 mov	0x40+8*3($a_ptr), $acc0
   1989 	movdqa	%xmm0, $in2_x(%rsp)
   1990 	 pshufd	\$0x1e, %xmm5, %xmm4
   1991 	movdqa	%xmm1, $in2_x+0x10(%rsp)
   1992 	por	%xmm0, %xmm1
   1993 	 movq	$r_ptr, %xmm0			# save $r_ptr
   1994 	movdqa	%xmm2, $in2_y(%rsp)
   1995 	movdqa	%xmm3, $in2_y+0x10(%rsp)
   1996 	por	%xmm2, %xmm3
   1997 	 por	%xmm4, %xmm5
   1998 	 pxor	%xmm4, %xmm4
   1999 	por	%xmm1, %xmm3
   2000 
   2001 	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
   2002 	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
   2003 	 mov	$acc6, $in2_z+8*1(%rsp)
   2004 	 mov	$acc7, $in2_z+8*2(%rsp)
   2005 	 mov	$acc0, $in2_z+8*3(%rsp)
   2006 	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
   2007 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
   2008 
   2009 	pcmpeqd	%xmm4, %xmm5
   2010 	pshufd	\$0xb1, %xmm3, %xmm4
   2011 	por	%xmm3, %xmm4
   2012 	pshufd	\$0, %xmm5, %xmm5		# in1infty
   2013 	pshufd	\$0x1e, %xmm4, %xmm3
   2014 	por	%xmm3, %xmm4
   2015 	pxor	%xmm3, %xmm3
   2016 	pcmpeqd	%xmm3, %xmm4
   2017 	pshufd	\$0, %xmm4, %xmm4		# in2infty
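	# in1infty/in2infty are all-zeros or all-ones masks: all-ones when the
	# corresponding input has all-zero X and Y words, which is how these
	# routines flag the point at infinity.  They drive the constant-time
	# selects at the end of the function.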
   2018 	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
   2019 	 mov	0x40+8*1($b_ptr), $acc6
   2020 	 mov	0x40+8*2($b_ptr), $acc7
   2021 	 mov	0x40+8*3($b_ptr), $acc0
   2022 
   2023 	lea	0x40-$bias($b_ptr), $a_ptr
   2024 	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
   2025 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
   2026 
   2027 	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
   2028 	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
   2029 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
   2030 
   2031 	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
   2032 	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
   2033 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
   2034 
   2035 	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
   2036 	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
   2037 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
   2038 
   2039 	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
   2040 	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
   2041 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
   2042 
   2043 	lea	$S1(%rsp), $b_ptr
   2044 	lea	$R(%rsp), $r_ptr		# R = S2 - S1
   2045 	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
   2046 
   2047 	or	$acc5, $acc4			# see if result is zero
   2048 	movdqa	%xmm4, %xmm2
   2049 	or	$acc0, $acc4
   2050 	or	$acc1, $acc4
   2051 	por	%xmm5, %xmm2			# in1infty || in2infty
   2052 	movq	$acc4, %xmm3
   2053 
   2054 	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
   2055 	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
   2056 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
   2057 
   2058 	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
   2059 	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
   2060 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
   2061 
   2062 	lea	$U1(%rsp), $b_ptr
   2063 	lea	$H(%rsp), $r_ptr		# H = U2 - U1
   2064 	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
   2065 
   2066 	or	$acc5, $acc4			# see if result is zero
   2067 	or	$acc0, $acc4
   2068 	or	$acc1, $acc4
   2069 
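	# Degenerate cases: proceed unless H == 0 with both inputs finite.  In
	# that case S1 != S2 means the points are inverses of each other, so
	# the all-zero point (infinity) is written below; equal inputs
	# (S1 == S2) are not special-cased by this routine.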
   2070 	.byte	0x3e				# predict taken
   2071 	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
   2072 	movq	%xmm2, $acc0
   2073 	movq	%xmm3, $acc1
   2074 	test	$acc0, $acc0
   2075 	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
   2076 	test	$acc1, $acc1
   2077 	jz	.Ladd_proceed$x			# is_equal(S1,S2)?
   2078 
   2079 	movq	%xmm0, $r_ptr			# restore $r_ptr
   2080 	pxor	%xmm0, %xmm0
   2081 	movdqu	%xmm0, 0x00($r_ptr)
   2082 	movdqu	%xmm0, 0x10($r_ptr)
   2083 	movdqu	%xmm0, 0x20($r_ptr)
   2084 	movdqu	%xmm0, 0x30($r_ptr)
   2085 	movdqu	%xmm0, 0x40($r_ptr)
   2086 	movdqu	%xmm0, 0x50($r_ptr)
   2087 	jmp	.Ladd_done$x
   2088 
   2089 .align	32
   2090 .Ladd_proceed$x:
   2091 	`&load_for_sqr("$R(%rsp)", "$src0")`
   2092 	lea	$Rsqr(%rsp), $r_ptr		# R^2
   2093 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
   2094 
   2095 	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
   2096 	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
   2097 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
   2098 
   2099 	`&load_for_sqr("$H(%rsp)", "$src0")`
   2100 	lea	$Hsqr(%rsp), $r_ptr		# H^2
   2101 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
   2102 
   2103 	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
   2104 	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
   2105 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
   2106 
   2107 	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
   2108 	lea	$Hcub(%rsp), $r_ptr		# H^3
   2109 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
   2110 
   2111 	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
   2112 	lea	$U2(%rsp), $r_ptr		# U1*H^2
   2113 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
   2114 ___
   2115 {
   2116 #######################################################################
   2117 # operate in 4-5-0-1 "name space" that matches multiplication output
   2118 #
   2119 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
   2120 my ($poly1, $poly3)=($acc6,$acc7);
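# The commented-out mul_by_2 call below is folded into the straight-line
# add/sub/cmov sequence that doubles U2 (= U1*H^2) in the registers left by
# the preceding multiplication, before Rsqr is loaded for the subtraction.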
   2121 
   2122 $code.=<<___;
   2123 	#lea	$U2(%rsp), $a_ptr
   2124 	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
   2125 	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
   2126 
   2127 	add	$acc0, $acc0		# a0:a3+a0:a3
   2128 	lea	$Rsqr(%rsp), $a_ptr
   2129 	adc	$acc1, $acc1
   2130 	 mov	$acc0, $t0
   2131 	adc	$acc2, $acc2
   2132 	adc	$acc3, $acc3
   2133 	 mov	$acc1, $t1
   2134 	sbb	$t4, $t4
   2135 
   2136 	sub	\$-1, $acc0
   2137 	 mov	$acc2, $t2
   2138 	sbb	$poly1, $acc1
   2139 	sbb	\$0, $acc2
   2140 	 mov	$acc3, $t3
   2141 	sbb	$poly3, $acc3
   2142 	test	$t4, $t4
   2143 
   2144 	cmovz	$t0, $acc0
   2145 	mov	8*0($a_ptr), $t0
   2146 	cmovz	$t1, $acc1
   2147 	mov	8*1($a_ptr), $t1
   2148 	cmovz	$t2, $acc2
   2149 	mov	8*2($a_ptr), $t2
   2150 	cmovz	$t3, $acc3
   2151 	mov	8*3($a_ptr), $t3
   2152 
   2153 	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
   2154 
   2155 	lea	$Hcub(%rsp), $b_ptr
   2156 	lea	$res_x(%rsp), $r_ptr
   2157 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
   2158 
   2159 	mov	$U2+8*0(%rsp), $t0
   2160 	mov	$U2+8*1(%rsp), $t1
   2161 	mov	$U2+8*2(%rsp), $t2
   2162 	mov	$U2+8*3(%rsp), $t3
   2163 	lea	$res_y(%rsp), $r_ptr
   2164 
   2165 	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
   2166 
   2167 	mov	$acc0, 8*0($r_ptr)		# save the result, as
   2168 	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
   2169 	mov	$acc2, 8*2($r_ptr)
   2170 	mov	$acc3, 8*3($r_ptr)
   2171 ___
   2172 }
   2173 $code.=<<___;
   2174 	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
   2175 	lea	$S2(%rsp), $r_ptr
   2176 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
   2177 
   2178 	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
   2179 	lea	$res_y(%rsp), $r_ptr
   2180 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
   2181 
   2182 	lea	$S2(%rsp), $b_ptr
   2183 	lea	$res_y(%rsp), $r_ptr
   2184 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
   2185 
   2186 	movq	%xmm0, $r_ptr		# restore $r_ptr
   2187 
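	# Each copy_conditional below is the branchless select
	#   res = (mask and src) or (not mask and res)
	# implemented with pand/pandn and keyed on the in1infty/in2infty masks,
	# so that adding the point at infinity returns the other operand.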
   2188 	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
   2189 	movdqa	%xmm5, %xmm1
   2190 	pandn	$res_z(%rsp), %xmm0
   2191 	movdqa	%xmm5, %xmm2
   2192 	pandn	$res_z+0x10(%rsp), %xmm1
   2193 	movdqa	%xmm5, %xmm3
   2194 	pand	$in2_z(%rsp), %xmm2
   2195 	pand	$in2_z+0x10(%rsp), %xmm3
   2196 	por	%xmm0, %xmm2
   2197 	por	%xmm1, %xmm3
   2198 
   2199 	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
   2200 	movdqa	%xmm4, %xmm1
   2201 	pandn	%xmm2, %xmm0
   2202 	movdqa	%xmm4, %xmm2
   2203 	pandn	%xmm3, %xmm1
   2204 	movdqa	%xmm4, %xmm3
   2205 	pand	$in1_z(%rsp), %xmm2
   2206 	pand	$in1_z+0x10(%rsp), %xmm3
   2207 	por	%xmm0, %xmm2
   2208 	por	%xmm1, %xmm3
   2209 	movdqu	%xmm2, 0x40($r_ptr)
   2210 	movdqu	%xmm3, 0x50($r_ptr)
   2211 
   2212 	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
   2213 	movdqa	%xmm5, %xmm1
   2214 	pandn	$res_x(%rsp), %xmm0
   2215 	movdqa	%xmm5, %xmm2
   2216 	pandn	$res_x+0x10(%rsp), %xmm1
   2217 	movdqa	%xmm5, %xmm3
   2218 	pand	$in2_x(%rsp), %xmm2
   2219 	pand	$in2_x+0x10(%rsp), %xmm3
   2220 	por	%xmm0, %xmm2
   2221 	por	%xmm1, %xmm3
   2222 
   2223 	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
   2224 	movdqa	%xmm4, %xmm1
   2225 	pandn	%xmm2, %xmm0
   2226 	movdqa	%xmm4, %xmm2
   2227 	pandn	%xmm3, %xmm1
   2228 	movdqa	%xmm4, %xmm3
   2229 	pand	$in1_x(%rsp), %xmm2
   2230 	pand	$in1_x+0x10(%rsp), %xmm3
   2231 	por	%xmm0, %xmm2
   2232 	por	%xmm1, %xmm3
   2233 	movdqu	%xmm2, 0x00($r_ptr)
   2234 	movdqu	%xmm3, 0x10($r_ptr)
   2235 
   2236 	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
   2237 	movdqa	%xmm5, %xmm1
   2238 	pandn	$res_y(%rsp), %xmm0
   2239 	movdqa	%xmm5, %xmm2
   2240 	pandn	$res_y+0x10(%rsp), %xmm1
   2241 	movdqa	%xmm5, %xmm3
   2242 	pand	$in2_y(%rsp), %xmm2
   2243 	pand	$in2_y+0x10(%rsp), %xmm3
   2244 	por	%xmm0, %xmm2
   2245 	por	%xmm1, %xmm3
   2246 
   2247 	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
   2248 	movdqa	%xmm4, %xmm1
   2249 	pandn	%xmm2, %xmm0
   2250 	movdqa	%xmm4, %xmm2
   2251 	pandn	%xmm3, %xmm1
   2252 	movdqa	%xmm4, %xmm3
   2253 	pand	$in1_y(%rsp), %xmm2
   2254 	pand	$in1_y+0x10(%rsp), %xmm3
   2255 	por	%xmm0, %xmm2
   2256 	por	%xmm1, %xmm3
   2257 	movdqu	%xmm2, 0x20($r_ptr)
   2258 	movdqu	%xmm3, 0x30($r_ptr)
   2259 
   2260 .Ladd_done$x:
   2261 	add	\$32*18+8, %rsp
   2262 	pop	%r15
   2263 	pop	%r14
   2264 	pop	%r13
   2265 	pop	%r12
   2266 	pop	%rbx
   2267 	pop	%rbp
   2268 	ret
   2269 .size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
   2270 ___
   2271 }
   2272 &gen_add("q");
   2273 
   2274 sub gen_add_affine () {
   2275     my $x = shift;
   2276     my ($src0,$sfx,$bias);
   2277     my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
   2278 	$res_x,$res_y,$res_z,
   2279 	$in1_x,$in1_y,$in1_z,
   2280 	$in2_x,$in2_y)=map(32*$_,(0..14));
   2281     my $Z1sqr = $S2;
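    # Mixed addition: the second input is affine, so Z2 = 1 and the general
    # formulas collapse to U1 = X1, S1 = Y1, U2 = X2*Z1^2, S2 = Y2*Z1^3,
    # H = U2 - X1, R = S2 - Y1, with Z3 = H*Z1 and X3/Y3 as in point_add.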
   2282 
   2283     if ($x ne "x") {
   2284 	$src0 = "%rax";
   2285 	$sfx  = "";
   2286 	$bias = 0;
   2287 
   2288 $code.=<<___;
   2289 .globl	ecp_nistz256_point_add_affine
   2290 .type	ecp_nistz256_point_add_affine,\@function,3
   2291 .align	32
   2292 ecp_nistz256_point_add_affine:
   2293 ___
   2294 $code.=<<___	if ($addx);
   2295 	mov	\$0x80100, %ecx
   2296 	and	OPENSSL_ia32cap_P+8(%rip), %ecx
   2297 	cmp	\$0x80100, %ecx
   2298 	je	.Lpoint_add_affinex
   2299 ___
   2300     } else {
   2301 	$src0 = "%rdx";
   2302 	$sfx  = "x";
   2303 	$bias = 128;
   2304 
   2305 $code.=<<___;
   2306 .type	ecp_nistz256_point_add_affinex,\@function,3
   2307 .align	32
   2308 ecp_nistz256_point_add_affinex:
   2309 .Lpoint_add_affinex:
   2310 ___
   2311     }
   2312 $code.=<<___;
   2313 	push	%rbp
   2314 	push	%rbx
   2315 	push	%r12
   2316 	push	%r13
   2317 	push	%r14
   2318 	push	%r15
   2319 	sub	\$32*15+8, %rsp
   2320 
   2321 	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
   2322 	mov	$b_org, $b_ptr		# reassign
   2323 	movdqu	0x10($a_ptr), %xmm1
   2324 	movdqu	0x20($a_ptr), %xmm2
   2325 	movdqu	0x30($a_ptr), %xmm3
   2326 	movdqu	0x40($a_ptr), %xmm4
   2327 	movdqu	0x50($a_ptr), %xmm5
   2328 	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
   2329 	 mov	0x40+8*1($a_ptr), $acc6
   2330 	 mov	0x40+8*2($a_ptr), $acc7
   2331 	 mov	0x40+8*3($a_ptr), $acc0
   2332 	movdqa	%xmm0, $in1_x(%rsp)
   2333 	movdqa	%xmm1, $in1_x+0x10(%rsp)
   2334 	por	%xmm0, %xmm1
   2335 	movdqa	%xmm2, $in1_y(%rsp)
   2336 	movdqa	%xmm3, $in1_y+0x10(%rsp)
   2337 	por	%xmm2, %xmm3
   2338 	movdqa	%xmm4, $in1_z(%rsp)
   2339 	movdqa	%xmm5, $in1_z+0x10(%rsp)
   2340 	por	%xmm1, %xmm3
   2341 
   2342 	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
   2343 	 pshufd	\$0xb1, %xmm3, %xmm5
   2344 	movdqu	0x10($b_ptr), %xmm1
   2345 	movdqu	0x20($b_ptr), %xmm2
   2346 	 por	%xmm3, %xmm5
   2347 	movdqu	0x30($b_ptr), %xmm3
   2348 	movdqa	%xmm0, $in2_x(%rsp)
   2349 	 pshufd	\$0x1e, %xmm5, %xmm4
   2350 	movdqa	%xmm1, $in2_x+0x10(%rsp)
   2351 	por	%xmm0, %xmm1
   2352 	 movq	$r_ptr, %xmm0		# save $r_ptr
   2353 	movdqa	%xmm2, $in2_y(%rsp)
   2354 	movdqa	%xmm3, $in2_y+0x10(%rsp)
   2355 	por	%xmm2, %xmm3
   2356 	 por	%xmm4, %xmm5
   2357 	 pxor	%xmm4, %xmm4
   2358 	por	%xmm1, %xmm3
   2359 
   2360 	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
   2361 	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
   2362 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
   2363 
   2364 	pcmpeqd	%xmm4, %xmm5
   2365 	pshufd	\$0xb1, %xmm3, %xmm4
   2366 	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
   2367 	 #lea	0x00($b_ptr), $b_ptr
   2368 	 mov	$acc4, $acc1			# harmonize sqr output and mul input
   2369 	por	%xmm3, %xmm4
   2370 	pshufd	\$0, %xmm5, %xmm5		# in1infty
   2371 	pshufd	\$0x1e, %xmm4, %xmm3
   2372 	 mov	$acc5, $acc2
   2373 	por	%xmm3, %xmm4
   2374 	pxor	%xmm3, %xmm3
   2375 	 mov	$acc6, $acc3
   2376 	pcmpeqd	%xmm3, %xmm4
   2377 	pshufd	\$0, %xmm4, %xmm4		# in2infty
   2378 
   2379 	lea	$Z1sqr-$bias(%rsp), $a_ptr
   2380 	mov	$acc7, $acc4
   2381 	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
   2382 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);
   2383 
   2384 	lea	$in1_x(%rsp), $b_ptr
   2385 	lea	$H(%rsp), $r_ptr		# H = U2 - U1
   2386 	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);
   2387 
   2388 	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
   2389 	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
   2390 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
   2391 
   2392 	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
   2393 	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
   2394 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
   2395 
   2396 	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
   2397 	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
   2398 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
   2399 
   2400 	lea	$in1_y(%rsp), $b_ptr
   2401 	lea	$R(%rsp), $r_ptr		# R = S2 - S1
   2402 	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);
   2403 
   2404 	`&load_for_sqr("$H(%rsp)", "$src0")`
   2405 	lea	$Hsqr(%rsp), $r_ptr		# H^2
   2406 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
   2407 
   2408 	`&load_for_sqr("$R(%rsp)", "$src0")`
   2409 	lea	$Rsqr(%rsp), $r_ptr		# R^2
   2410 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
   2411 
   2412 	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
   2413 	lea	$Hcub(%rsp), $r_ptr		# H^3
   2414 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
   2415 
   2416 	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
   2417 	lea	$U2(%rsp), $r_ptr		# U1*H^2
   2418 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
   2419 ___
   2420 {
   2421 #######################################################################
   2422 # operate in 4-5-0-1 "name space" that matches multiplication output
   2423 #
   2424 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
   2425 my ($poly1, $poly3)=($acc6,$acc7);
   2426 
   2427 $code.=<<___;
   2428 	#lea	$U2(%rsp), $a_ptr
   2429 	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
   2430 	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
   2431 
   2432 	add	$acc0, $acc0		# a0:a3+a0:a3
   2433 	lea	$Rsqr(%rsp), $a_ptr
   2434 	adc	$acc1, $acc1
   2435 	 mov	$acc0, $t0
   2436 	adc	$acc2, $acc2
   2437 	adc	$acc3, $acc3
   2438 	 mov	$acc1, $t1
   2439 	sbb	$t4, $t4
   2440 
   2441 	sub	\$-1, $acc0
   2442 	 mov	$acc2, $t2
   2443 	sbb	$poly1, $acc1
   2444 	sbb	\$0, $acc2
   2445 	 mov	$acc3, $t3
   2446 	sbb	$poly3, $acc3
   2447 	test	$t4, $t4
   2448 
   2449 	cmovz	$t0, $acc0
   2450 	mov	8*0($a_ptr), $t0
   2451 	cmovz	$t1, $acc1
   2452 	mov	8*1($a_ptr), $t1
   2453 	cmovz	$t2, $acc2
   2454 	mov	8*2($a_ptr), $t2
   2455 	cmovz	$t3, $acc3
   2456 	mov	8*3($a_ptr), $t3
   2457 
   2458 	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
   2459 
   2460 	lea	$Hcub(%rsp), $b_ptr
   2461 	lea	$res_x(%rsp), $r_ptr
   2462 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
   2463 
   2464 	mov	$U2+8*0(%rsp), $t0
   2465 	mov	$U2+8*1(%rsp), $t1
   2466 	mov	$U2+8*2(%rsp), $t2
   2467 	mov	$U2+8*3(%rsp), $t3
   2468 	lea	$H(%rsp), $r_ptr
   2469 
   2470 	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);
   2471 
   2472 	mov	$acc0, 8*0($r_ptr)		# save the result, as
   2473 	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
   2474 	mov	$acc2, 8*2($r_ptr)
   2475 	mov	$acc3, 8*3($r_ptr)
   2476 ___
   2477 }
   2478 $code.=<<___;
   2479 	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
   2480 	lea	$S2(%rsp), $r_ptr
   2481 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);
   2482 
   2483 	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
   2484 	lea	$H(%rsp), $r_ptr
   2485 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);
   2486 
   2487 	lea	$S2(%rsp), $b_ptr
   2488 	lea	$res_y(%rsp), $r_ptr
   2489 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);
   2490 
   2491 	movq	%xmm0, $r_ptr		# restore $r_ptr
   2492 
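	# Same pand/pandn selects as in point_add, except that when the result
	# must fall back to the affine input, its Z coordinate is taken from
	# .LONE_mont (1 in Montgomery form), since an affine point has an
	# implicit Z = 1.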
   2493 	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
   2494 	movdqa	%xmm5, %xmm1
   2495 	pandn	$res_z(%rsp), %xmm0
   2496 	movdqa	%xmm5, %xmm2
   2497 	pandn	$res_z+0x10(%rsp), %xmm1
   2498 	movdqa	%xmm5, %xmm3
   2499 	pand	.LONE_mont(%rip), %xmm2
   2500 	pand	.LONE_mont+0x10(%rip), %xmm3
   2501 	por	%xmm0, %xmm2
   2502 	por	%xmm1, %xmm3
   2503 
   2504 	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
   2505 	movdqa	%xmm4, %xmm1
   2506 	pandn	%xmm2, %xmm0
   2507 	movdqa	%xmm4, %xmm2
   2508 	pandn	%xmm3, %xmm1
   2509 	movdqa	%xmm4, %xmm3
   2510 	pand	$in1_z(%rsp), %xmm2
   2511 	pand	$in1_z+0x10(%rsp), %xmm3
   2512 	por	%xmm0, %xmm2
   2513 	por	%xmm1, %xmm3
   2514 	movdqu	%xmm2, 0x40($r_ptr)
   2515 	movdqu	%xmm3, 0x50($r_ptr)
   2516 
   2517 	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
   2518 	movdqa	%xmm5, %xmm1
   2519 	pandn	$res_x(%rsp), %xmm0
   2520 	movdqa	%xmm5, %xmm2
   2521 	pandn	$res_x+0x10(%rsp), %xmm1
   2522 	movdqa	%xmm5, %xmm3
   2523 	pand	$in2_x(%rsp), %xmm2
   2524 	pand	$in2_x+0x10(%rsp), %xmm3
   2525 	por	%xmm0, %xmm2
   2526 	por	%xmm1, %xmm3
   2527 
   2528 	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
   2529 	movdqa	%xmm4, %xmm1
   2530 	pandn	%xmm2, %xmm0
   2531 	movdqa	%xmm4, %xmm2
   2532 	pandn	%xmm3, %xmm1
   2533 	movdqa	%xmm4, %xmm3
   2534 	pand	$in1_x(%rsp), %xmm2
   2535 	pand	$in1_x+0x10(%rsp), %xmm3
   2536 	por	%xmm0, %xmm2
   2537 	por	%xmm1, %xmm3
   2538 	movdqu	%xmm2, 0x00($r_ptr)
   2539 	movdqu	%xmm3, 0x10($r_ptr)
   2540 
   2541 	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
   2542 	movdqa	%xmm5, %xmm1
   2543 	pandn	$res_y(%rsp), %xmm0
   2544 	movdqa	%xmm5, %xmm2
   2545 	pandn	$res_y+0x10(%rsp), %xmm1
   2546 	movdqa	%xmm5, %xmm3
   2547 	pand	$in2_y(%rsp), %xmm2
   2548 	pand	$in2_y+0x10(%rsp), %xmm3
   2549 	por	%xmm0, %xmm2
   2550 	por	%xmm1, %xmm3
   2551 
   2552 	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
   2553 	movdqa	%xmm4, %xmm1
   2554 	pandn	%xmm2, %xmm0
   2555 	movdqa	%xmm4, %xmm2
   2556 	pandn	%xmm3, %xmm1
   2557 	movdqa	%xmm4, %xmm3
   2558 	pand	$in1_y(%rsp), %xmm2
   2559 	pand	$in1_y+0x10(%rsp), %xmm3
   2560 	por	%xmm0, %xmm2
   2561 	por	%xmm1, %xmm3
   2562 	movdqu	%xmm2, 0x20($r_ptr)
   2563 	movdqu	%xmm3, 0x30($r_ptr)
   2564 
   2565 	add	\$32*15+8, %rsp
   2566 	pop	%r15
   2567 	pop	%r14
   2568 	pop	%r13
   2569 	pop	%r12
   2570 	pop	%rbx
   2571 	pop	%rbp
   2572 	ret
   2573 .size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
   2574 ___
   2575 }
   2576 &gen_add_affine("q");
   2577 
   2578 ########################################################################
    2579 # AD*X "magic": MULX/ADCX/ADOX-based code path
   2580 #
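# The x-suffixed helpers below mirror their q-suffixed counterparts but assume
# the 4-5-0-1 register layout produced by the MULX/ADCX/ADOX-based Montgomery
# multiplication and squaring code, and are only emitted when $addx is set.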
   2581 if ($addx) {								{
   2582 ########################################################################
   2583 # operate in 4-5-0-1 "name space" that matches multiplication output
   2584 #
   2585 my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
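# Each helper keeps the carry/borrow of the first pass in $t4, then performs a
# trial subtraction or addition of the prime and selects between the two
# candidates with cmov, keeping the code path branch-free.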
   2586 
   2587 $code.=<<___;
   2588 .type	__ecp_nistz256_add_tox,\@abi-omnipotent
   2589 .align	32
   2590 __ecp_nistz256_add_tox:
   2591 	xor	$t4, $t4
   2592 	adc	8*0($b_ptr), $a0
   2593 	adc	8*1($b_ptr), $a1
   2594 	 mov	$a0, $t0
   2595 	adc	8*2($b_ptr), $a2
   2596 	adc	8*3($b_ptr), $a3
   2597 	 mov	$a1, $t1
   2598 	adc	\$0, $t4
   2599 
   2600 	xor	$t3, $t3
   2601 	sbb	\$-1, $a0
   2602 	 mov	$a2, $t2
   2603 	sbb	$poly1, $a1
   2604 	sbb	\$0, $a2
   2605 	 mov	$a3, $t3
   2606 	sbb	$poly3, $a3
   2607 
   2608 	bt	\$0, $t4
   2609 	cmovnc	$t0, $a0
   2610 	cmovnc	$t1, $a1
   2611 	mov	$a0, 8*0($r_ptr)
   2612 	cmovnc	$t2, $a2
   2613 	mov	$a1, 8*1($r_ptr)
   2614 	cmovnc	$t3, $a3
   2615 	mov	$a2, 8*2($r_ptr)
   2616 	mov	$a3, 8*3($r_ptr)
   2617 
   2618 	ret
   2619 .size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
   2620 
   2621 .type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
   2622 .align	32
   2623 __ecp_nistz256_sub_fromx:
   2624 	xor	$t4, $t4
   2625 	sbb	8*0($b_ptr), $a0
   2626 	sbb	8*1($b_ptr), $a1
   2627 	 mov	$a0, $t0
   2628 	sbb	8*2($b_ptr), $a2
   2629 	sbb	8*3($b_ptr), $a3
   2630 	 mov	$a1, $t1
   2631 	sbb	\$0, $t4
   2632 
   2633 	xor	$t3, $t3
   2634 	adc	\$-1, $a0
   2635 	 mov	$a2, $t2
   2636 	adc	$poly1, $a1
   2637 	adc	\$0, $a2
   2638 	 mov	$a3, $t3
   2639 	adc	$poly3, $a3
   2640 
   2641 	bt	\$0, $t4
   2642 	cmovnc	$t0, $a0
   2643 	cmovnc	$t1, $a1
   2644 	mov	$a0, 8*0($r_ptr)
   2645 	cmovnc	$t2, $a2
   2646 	mov	$a1, 8*1($r_ptr)
   2647 	cmovnc	$t3, $a3
   2648 	mov	$a2, 8*2($r_ptr)
   2649 	mov	$a3, 8*3($r_ptr)
   2650 
   2651 	ret
   2652 .size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
   2653 
   2654 .type	__ecp_nistz256_subx,\@abi-omnipotent
   2655 .align	32
   2656 __ecp_nistz256_subx:
   2657 	xor	$t4, $t4
   2658 	sbb	$a0, $t0
   2659 	sbb	$a1, $t1
   2660 	 mov	$t0, $a0
   2661 	sbb	$a2, $t2
   2662 	sbb	$a3, $t3
   2663 	 mov	$t1, $a1
   2664 	sbb	\$0, $t4
   2665 
    2666 	xor	$a3, $a3
   2667 	adc	\$-1, $t0
   2668 	 mov	$t2, $a2
   2669 	adc	$poly1, $t1
   2670 	adc	\$0, $t2
   2671 	 mov	$t3, $a3
   2672 	adc	$poly3, $t3
   2673 
   2674 	bt	\$0, $t4
   2675 	cmovc	$t0, $a0
   2676 	cmovc	$t1, $a1
   2677 	cmovc	$t2, $a2
   2678 	cmovc	$t3, $a3
   2679 
   2680 	ret
   2681 .size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
   2682 
   2683 .type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
   2684 .align	32
   2685 __ecp_nistz256_mul_by_2x:
   2686 	xor	$t4, $t4
   2687 	adc	$a0, $a0		# a0:a3+a0:a3
   2688 	adc	$a1, $a1
   2689 	 mov	$a0, $t0
   2690 	adc	$a2, $a2
   2691 	adc	$a3, $a3
   2692 	 mov	$a1, $t1
   2693 	adc	\$0, $t4
   2694 
   2695 	xor	$t3, $t3
   2696 	sbb	\$-1, $a0
   2697 	 mov	$a2, $t2
   2698 	sbb	$poly1, $a1
   2699 	sbb	\$0, $a2
   2700 	 mov	$a3, $t3
   2701 	sbb	$poly3, $a3
   2702 
   2703 	bt	\$0, $t4
   2704 	cmovnc	$t0, $a0
   2705 	cmovnc	$t1, $a1
   2706 	mov	$a0, 8*0($r_ptr)
   2707 	cmovnc	$t2, $a2
   2708 	mov	$a1, 8*1($r_ptr)
   2709 	cmovnc	$t3, $a3
   2710 	mov	$a2, 8*2($r_ptr)
   2711 	mov	$a3, 8*3($r_ptr)
   2712 
   2713 	ret
   2714 .size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
   2715 ___
   2716 									}
   2717 &gen_double("x");
   2718 &gen_add("x");
   2719 &gen_add_affine("x");
   2720 }
   2721 }}}
   2722 
   2723 $code =~ s/\`([^\`]*)\`/eval $1/gem;
   2724 print $code;
   2725 close STDOUT;
   2726