      1 #! /usr/bin/env perl
      2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
      3 # Copyright (c) 2014, Intel Corporation. All Rights Reserved.
      4 # Copyright (c) 2015 CloudFlare, Inc.
      5 #
      6 # Licensed under the OpenSSL license (the "License").  You may not use
      7 # this file except in compliance with the License.  You can obtain a copy
      8 # in the file LICENSE in the source distribution or at
      9 # https://www.openssl.org/source/license.html
     10 #
     11 # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
     12 # (1) Intel Corporation, Israel Development Center, Haifa, Israel
     13 # (2) University of Haifa, Israel
     14 # (3) CloudFlare, Inc.
     15 #
     16 # Reference:
     17 # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
     18 #                          256 Bit Primes"
     19 
      20 # Further optimization by <appro@openssl.org>:
     21 #
     22 #		this/original	with/without -DECP_NISTZ256_ASM(*)
     23 # Opteron	+15-49%		+150-195%
     24 # Bulldozer	+18-45%		+175-240%
     25 # P4		+24-46%		+100-150%
     26 # Westmere	+18-34%		+87-160%
     27 # Sandy Bridge	+14-35%		+120-185%
     28 # Ivy Bridge	+11-35%		+125-180%
     29 # Haswell	+10-37%		+160-200%
     30 # Broadwell	+24-58%		+210-270%
     31 # Atom		+20-50%		+180-240%
     32 # VIA Nano	+50-160%	+480-480%
     33 #
     34 # (*)	"without -DECP_NISTZ256_ASM" refers to build with
     35 #	"enable-ec_nistp_64_gcc_128";
     36 #
      37 # Ranges denote the minimum and maximum improvement coefficients,
      38 # depending on benchmark. In the "this/original" column the lower
      39 # coefficient is for ECDSA sign, while in "with/without" it is for ECDH
      40 # key agreement; the higher coefficient is for ECDSA sign, the relatively
      41 # fastest server-side operation. Keep in mind that +100% means 2x improvement.
     42 
     43 $flavour = shift;
     44 $output  = shift;
     45 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     46 
     47 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     48 
     49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     50 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     51 ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     52 die "can't locate x86_64-xlate.pl";
     53 
     54 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
     55 *STDOUT=*OUT;
     56 
     57 $avx = 2;
     58 $addx = 1;
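         # Both the AVX2 and the ADX/BMI2 code paths are always emitted; the choice
         # between them is made at run time via OPENSSL_ia32cap_P (see the checks below).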
     59 
     60 $code.=<<___;
     61 .text
     62 .extern	OPENSSL_ia32cap_P
     63 
     64 # The polynomial
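         # (the NIST P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
         #  stored least-significant 64-bit limb first)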
     65 .align 64
     66 .Lpoly:
     67 .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
     68 
     69 .LOne:
     70 .long 1,1,1,1,1,1,1,1
     71 .LTwo:
     72 .long 2,2,2,2,2,2,2,2
     73 .LThree:
     74 .long 3,3,3,3,3,3,3,3
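         # 1 in Montgomery representation, i.e. 2^256 mod p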
     75 .LONE_mont:
     76 .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
     77 
     78 # Constants for computations modulo ord(p256)
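         # .Lord is the group order n (least-significant limb first) and .LordK is
         # -1/n mod 2^64, the per-limb constant for Montgomery reduction modulo n.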
     79 .Lord:
     80 .quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
     81 .LordK:
     82 .quad 0xccd1c8aaee00bc4f
     83 ___
     84 
     85 {
     86 my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
     87 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
     88 my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
     89 
     90 $code.=<<___;
     91 
     92 ################################################################################
     93 # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
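         # Computes res = -a mod p in constant time.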
     94 .globl	ecp_nistz256_neg
     95 .type	ecp_nistz256_neg,\@function,2
     96 .align	32
     97 ecp_nistz256_neg:
     98 .cfi_startproc
     99 	push	%r12
    100 .cfi_push	%r12
    101 	push	%r13
    102 .cfi_push	%r13
    103 .Lneg_body:
    104 
    105 	xor	$a0, $a0
    106 	xor	$a1, $a1
    107 	xor	$a2, $a2
    108 	xor	$a3, $a3
    109 	xor	$t4, $t4
    110 
    111 	sub	8*0($a_ptr), $a0
    112 	sbb	8*1($a_ptr), $a1
    113 	sbb	8*2($a_ptr), $a2
    114 	 mov	$a0, $t0
    115 	sbb	8*3($a_ptr), $a3
    116 	lea	.Lpoly(%rip), $a_ptr
    117 	 mov	$a1, $t1
    118 	sbb	\$0, $t4
    119 
    120 	add	8*0($a_ptr), $a0
    121 	 mov	$a2, $t2
    122 	adc	8*1($a_ptr), $a1
    123 	adc	8*2($a_ptr), $a2
    124 	 mov	$a3, $t3
    125 	adc	8*3($a_ptr), $a3
    126 	test	$t4, $t4
    127 
    128 	cmovz	$t0, $a0
    129 	cmovz	$t1, $a1
    130 	mov	$a0, 8*0($r_ptr)
    131 	cmovz	$t2, $a2
    132 	mov	$a1, 8*1($r_ptr)
    133 	cmovz	$t3, $a3
    134 	mov	$a2, 8*2($r_ptr)
    135 	mov	$a3, 8*3($r_ptr)
    136 
    137 	mov	0(%rsp),%r13
    138 .cfi_restore	%r13
    139 	mov	8(%rsp),%r12
    140 .cfi_restore	%r12
    141 	lea	16(%rsp),%rsp
    142 .cfi_adjust_cfa_offset	-16
    143 .Lneg_epilogue:
    144 	ret
    145 .cfi_endproc
    146 .size	ecp_nistz256_neg,.-ecp_nistz256_neg
    147 ___
    148 }
    149 {
    150 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
    151 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
    152 my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
    153 my ($poly1,$poly3)=($acc6,$acc7);
    154 
    155 $code.=<<___;
    156 ################################################################################
    157 # void ecp_nistz256_ord_mul_mont(
    158 #   uint64_t res[4],
    159 #   uint64_t a[4],
    160 #   uint64_t b[4]);
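         # Computes res = a * b * 2^-256 mod ord(p256), i.e. Montgomery
         # multiplication modulo the group order.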
    161 
    162 .globl	ecp_nistz256_ord_mul_mont
    163 .type	ecp_nistz256_ord_mul_mont,\@function,3
    164 .align	32
    165 ecp_nistz256_ord_mul_mont:
    166 .cfi_startproc
    167 ___
    168 $code.=<<___	if ($addx);
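         # Take the ADX code path when both BMI2 (bit 8) and ADX (bit 19) are set
         # in OPENSSL_ia32cap_P[2], i.e. CPUID.(EAX=7,ECX=0):EBX.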
    169 	leaq	OPENSSL_ia32cap_P(%rip), %rcx
    170 	mov	8(%rcx), %rcx
    171 	and	\$0x80100, %ecx
    172 	cmp	\$0x80100, %ecx
    173 	je	.Lecp_nistz256_ord_mul_montx
    174 ___
    175 $code.=<<___;
    176 	push	%rbp
    177 .cfi_push	%rbp
    178 	push	%rbx
    179 .cfi_push	%rbx
    180 	push	%r12
    181 .cfi_push	%r12
    182 	push	%r13
    183 .cfi_push	%r13
    184 	push	%r14
    185 .cfi_push	%r14
    186 	push	%r15
    187 .cfi_push	%r15
    188 .Lord_mul_body:
    189 
    190 	mov	8*0($b_org), %rax
    191 	mov	$b_org, $b_ptr
    192 	lea	.Lord(%rip), %r14
    193 	mov	.LordK(%rip), %r15
    194 
    195 	################################# * b[0]
    196 	mov	%rax, $t0
    197 	mulq	8*0($a_ptr)
    198 	mov	%rax, $acc0
    199 	mov	$t0, %rax
    200 	mov	%rdx, $acc1
    201 
    202 	mulq	8*1($a_ptr)
    203 	add	%rax, $acc1
    204 	mov	$t0, %rax
    205 	adc	\$0, %rdx
    206 	mov	%rdx, $acc2
    207 
    208 	mulq	8*2($a_ptr)
    209 	add	%rax, $acc2
    210 	mov	$t0, %rax
    211 	adc	\$0, %rdx
    212 
    213 	 mov	$acc0, $acc5
    214 	 imulq	%r15,$acc0
    215 
    216 	mov	%rdx, $acc3
    217 	mulq	8*3($a_ptr)
    218 	add	%rax, $acc3
    219 	 mov	$acc0, %rax
    220 	adc	\$0, %rdx
    221 	mov	%rdx, $acc4
    222 
    223 	################################# First reduction step
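         	# The Montgomery multiplier m (low limb of acc times .LordK, computed by
         	# the imulq above) is added as m * ord; the multiplications by
         	# ord[2] = 2^64-1 and ord[3] = 2^64-2^32 are folded into the sub/sbb
         	# and shl/shr sequences below instead of explicit mulq.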
    224 	mulq	8*0(%r14)
    225 	mov	$acc0, $t1
    226 	add	%rax, $acc5		# guaranteed to be zero
    227 	mov	$acc0, %rax
    228 	adc	\$0, %rdx
    229 	mov	%rdx, $t0
    230 
    231 	sub	$acc0, $acc2
    232 	sbb	\$0, $acc0		# can't borrow
    233 
    234 	mulq	8*1(%r14)
    235 	add	$t0, $acc1
    236 	adc	\$0, %rdx
    237 	add	%rax, $acc1
    238 	mov	$t1, %rax
    239 	adc	%rdx, $acc2
    240 	mov	$t1, %rdx
    241 	adc	\$0, $acc0		# can't overflow
    242 
    243 	shl	\$32, %rax
    244 	shr	\$32, %rdx
    245 	sub	%rax, $acc3
    246 	 mov	8*1($b_ptr), %rax
    247 	sbb	%rdx, $t1		# can't borrow
    248 
    249 	add	$acc0, $acc3
    250 	adc	$t1, $acc4
    251 	adc	\$0, $acc5
    252 
    253 	################################# * b[1]
    254 	mov	%rax, $t0
    255 	mulq	8*0($a_ptr)
    256 	add	%rax, $acc1
    257 	mov	$t0, %rax
    258 	adc	\$0, %rdx
    259 	mov	%rdx, $t1
    260 
    261 	mulq	8*1($a_ptr)
    262 	add	$t1, $acc2
    263 	adc	\$0, %rdx
    264 	add	%rax, $acc2
    265 	mov	$t0, %rax
    266 	adc	\$0, %rdx
    267 	mov	%rdx, $t1
    268 
    269 	mulq	8*2($a_ptr)
    270 	add	$t1, $acc3
    271 	adc	\$0, %rdx
    272 	add	%rax, $acc3
    273 	mov	$t0, %rax
    274 	adc	\$0, %rdx
    275 
    276 	 mov	$acc1, $t0
    277 	 imulq	%r15, $acc1
    278 
    279 	mov	%rdx, $t1
    280 	mulq	8*3($a_ptr)
    281 	add	$t1, $acc4
    282 	adc	\$0, %rdx
    283 	xor	$acc0, $acc0
    284 	add	%rax, $acc4
    285 	 mov	$acc1, %rax
    286 	adc	%rdx, $acc5
    287 	adc	\$0, $acc0
    288 
    289 	################################# Second reduction step
    290 	mulq	8*0(%r14)
    291 	mov	$acc1, $t1
    292 	add	%rax, $t0		# guaranteed to be zero
    293 	mov	$acc1, %rax
    294 	adc	%rdx, $t0
    295 
    296 	sub	$acc1, $acc3
    297 	sbb	\$0, $acc1		# can't borrow
    298 
    299 	mulq	8*1(%r14)
    300 	add	$t0, $acc2
    301 	adc	\$0, %rdx
    302 	add	%rax, $acc2
    303 	mov	$t1, %rax
    304 	adc	%rdx, $acc3
    305 	mov	$t1, %rdx
    306 	adc	\$0, $acc1		# can't overflow
    307 
    308 	shl	\$32, %rax
    309 	shr	\$32, %rdx
    310 	sub	%rax, $acc4
    311 	 mov	8*2($b_ptr), %rax
    312 	sbb	%rdx, $t1		# can't borrow
    313 
    314 	add	$acc1, $acc4
    315 	adc	$t1, $acc5
    316 	adc	\$0, $acc0
    317 
    318 	################################## * b[2]
    319 	mov	%rax, $t0
    320 	mulq	8*0($a_ptr)
    321 	add	%rax, $acc2
    322 	mov	$t0, %rax
    323 	adc	\$0, %rdx
    324 	mov	%rdx, $t1
    325 
    326 	mulq	8*1($a_ptr)
    327 	add	$t1, $acc3
    328 	adc	\$0, %rdx
    329 	add	%rax, $acc3
    330 	mov	$t0, %rax
    331 	adc	\$0, %rdx
    332 	mov	%rdx, $t1
    333 
    334 	mulq	8*2($a_ptr)
    335 	add	$t1, $acc4
    336 	adc	\$0, %rdx
    337 	add	%rax, $acc4
    338 	mov	$t0, %rax
    339 	adc	\$0, %rdx
    340 
    341 	 mov	$acc2, $t0
    342 	 imulq	%r15, $acc2
    343 
    344 	mov	%rdx, $t1
    345 	mulq	8*3($a_ptr)
    346 	add	$t1, $acc5
    347 	adc	\$0, %rdx
    348 	xor	$acc1, $acc1
    349 	add	%rax, $acc5
    350 	 mov	$acc2, %rax
    351 	adc	%rdx, $acc0
    352 	adc	\$0, $acc1
    353 
    354 	################################# Third reduction step
    355 	mulq	8*0(%r14)
    356 	mov	$acc2, $t1
    357 	add	%rax, $t0		# guaranteed to be zero
    358 	mov	$acc2, %rax
    359 	adc	%rdx, $t0
    360 
    361 	sub	$acc2, $acc4
    362 	sbb	\$0, $acc2		# can't borrow
    363 
    364 	mulq	8*1(%r14)
    365 	add	$t0, $acc3
    366 	adc	\$0, %rdx
    367 	add	%rax, $acc3
    368 	mov	$t1, %rax
    369 	adc	%rdx, $acc4
    370 	mov	$t1, %rdx
    371 	adc	\$0, $acc2		# can't overflow
    372 
    373 	shl	\$32, %rax
    374 	shr	\$32, %rdx
    375 	sub	%rax, $acc5
    376 	 mov	8*3($b_ptr), %rax
    377 	sbb	%rdx, $t1		# can't borrow
    378 
    379 	add	$acc2, $acc5
    380 	adc	$t1, $acc0
    381 	adc	\$0, $acc1
    382 
    383 	################################# * b[3]
    384 	mov	%rax, $t0
    385 	mulq	8*0($a_ptr)
    386 	add	%rax, $acc3
    387 	mov	$t0, %rax
    388 	adc	\$0, %rdx
    389 	mov	%rdx, $t1
    390 
    391 	mulq	8*1($a_ptr)
    392 	add	$t1, $acc4
    393 	adc	\$0, %rdx
    394 	add	%rax, $acc4
    395 	mov	$t0, %rax
    396 	adc	\$0, %rdx
    397 	mov	%rdx, $t1
    398 
    399 	mulq	8*2($a_ptr)
    400 	add	$t1, $acc5
    401 	adc	\$0, %rdx
    402 	add	%rax, $acc5
    403 	mov	$t0, %rax
    404 	adc	\$0, %rdx
    405 
    406 	 mov	$acc3, $t0
    407 	 imulq	%r15, $acc3
    408 
    409 	mov	%rdx, $t1
    410 	mulq	8*3($a_ptr)
    411 	add	$t1, $acc0
    412 	adc	\$0, %rdx
    413 	xor	$acc2, $acc2
    414 	add	%rax, $acc0
    415 	 mov	$acc3, %rax
    416 	adc	%rdx, $acc1
    417 	adc	\$0, $acc2
    418 
    419 	################################# Last reduction step
    420 	mulq	8*0(%r14)
    421 	mov	$acc3, $t1
    422 	add	%rax, $t0		# guaranteed to be zero
    423 	mov	$acc3, %rax
    424 	adc	%rdx, $t0
    425 
    426 	sub	$acc3, $acc5
    427 	sbb	\$0, $acc3		# can't borrow
    428 
    429 	mulq	8*1(%r14)
    430 	add	$t0, $acc4
    431 	adc	\$0, %rdx
    432 	add	%rax, $acc4
    433 	mov	$t1, %rax
    434 	adc	%rdx, $acc5
    435 	mov	$t1, %rdx
    436 	adc	\$0, $acc3		# can't overflow
    437 
    438 	shl	\$32, %rax
    439 	shr	\$32, %rdx
    440 	sub	%rax, $acc0
    441 	sbb	%rdx, $t1		# can't borrow
    442 
    443 	add	$acc3, $acc0
    444 	adc	$t1, $acc1
    445 	adc	\$0, $acc2
    446 
    447 	################################# Subtract ord
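         	# The pre-reduction result is less than 2*ord, so a single conditional
         	# subtraction (selected branchlessly with cmovc) fully reduces it.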
    448 	 mov	$acc4, $a_ptr
    449 	sub	8*0(%r14), $acc4
    450 	 mov	$acc5, $acc3
    451 	sbb	8*1(%r14), $acc5
    452 	 mov	$acc0, $t0
    453 	sbb	8*2(%r14), $acc0
    454 	 mov	$acc1, $t1
    455 	sbb	8*3(%r14), $acc1
    456 	sbb	\$0, $acc2
    457 
    458 	cmovc	$a_ptr, $acc4
    459 	cmovc	$acc3, $acc5
    460 	cmovc	$t0, $acc0
    461 	cmovc	$t1, $acc1
    462 
    463 	mov	$acc4, 8*0($r_ptr)
    464 	mov	$acc5, 8*1($r_ptr)
    465 	mov	$acc0, 8*2($r_ptr)
    466 	mov	$acc1, 8*3($r_ptr)
    467 
    468 	mov	0(%rsp),%r15
    469 .cfi_restore	%r15
    470 	mov	8(%rsp),%r14
    471 .cfi_restore	%r14
    472 	mov	16(%rsp),%r13
    473 .cfi_restore	%r13
    474 	mov	24(%rsp),%r12
    475 .cfi_restore	%r12
    476 	mov	32(%rsp),%rbx
    477 .cfi_restore	%rbx
    478 	mov	40(%rsp),%rbp
    479 .cfi_restore	%rbp
    480 	lea	48(%rsp),%rsp
    481 .cfi_adjust_cfa_offset	-48
    482 .Lord_mul_epilogue:
    483 	ret
    484 .cfi_endproc
    485 .size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
    486 
    487 ################################################################################
    488 # void ecp_nistz256_ord_sqr_mont(
    489 #   uint64_t res[4],
    490 #   uint64_t a[4],
    491 #   uint64_t rep);
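         # Performs rep consecutive Montgomery squarings of a modulo ord(p256).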
    492 
    493 .globl	ecp_nistz256_ord_sqr_mont
    494 .type	ecp_nistz256_ord_sqr_mont,\@function,3
    495 .align	32
    496 ecp_nistz256_ord_sqr_mont:
    497 .cfi_startproc
    498 ___
    499 $code.=<<___	if ($addx);
    500 	leaq	OPENSSL_ia32cap_P(%rip), %rcx
    501 	mov	8(%rcx), %rcx
    502 	and	\$0x80100, %ecx
    503 	cmp	\$0x80100, %ecx
    504 	je	.Lecp_nistz256_ord_sqr_montx
    505 ___
    506 $code.=<<___;
    507 	push	%rbp
    508 .cfi_push	%rbp
    509 	push	%rbx
    510 .cfi_push	%rbx
    511 	push	%r12
    512 .cfi_push	%r12
    513 	push	%r13
    514 .cfi_push	%r13
    515 	push	%r14
    516 .cfi_push	%r14
    517 	push	%r15
    518 .cfi_push	%r15
    519 .Lord_sqr_body:
    520 
    521 	mov	8*0($a_ptr), $acc0
    522 	mov	8*1($a_ptr), %rax
    523 	mov	8*2($a_ptr), $acc6
    524 	mov	8*3($a_ptr), $acc7
    525 	lea	.Lord(%rip), $a_ptr	# pointer to modulus
    526 	mov	$b_org, $b_ptr
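         	# the third argument (rep) is kept as the loop counter; each pass
         	# through .Loop_ord_sqr performs one Montgomery squaring modulo ord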
    527 	jmp	.Loop_ord_sqr
    528 
    529 .align	32
    530 .Loop_ord_sqr:
    531 	################################# a[1:] * a[0]
    532 	mov	%rax, $t1		# put aside a[1]
    533 	mul	$acc0			# a[1] * a[0]
    534 	mov	%rax, $acc1
    535 	movq	$t1, %xmm1		# offload a[1]
    536 	mov	$acc6, %rax
    537 	mov	%rdx, $acc2
    538 
    539 	mul	$acc0			# a[2] * a[0]
    540 	add	%rax, $acc2
    541 	mov	$acc7, %rax
    542 	movq	$acc6, %xmm2		# offload a[2]
    543 	adc	\$0, %rdx
    544 	mov	%rdx, $acc3
    545 
    546 	mul	$acc0			# a[3] * a[0]
    547 	add	%rax, $acc3
    548 	mov	$acc7, %rax
    549 	movq	$acc7, %xmm3		# offload a[3]
    550 	adc	\$0, %rdx
    551 	mov	%rdx, $acc4
    552 
    553 	################################# a[3] * a[2]
    554 	mul	$acc6			# a[3] * a[2]
    555 	mov	%rax, $acc5
    556 	mov	$acc6, %rax
    557 	mov	%rdx, $acc6
    558 
    559 	################################# a[2:] * a[1]
    560 	mul	$t1			# a[2] * a[1]
    561 	add	%rax, $acc3
    562 	mov	$acc7, %rax
    563 	adc	\$0, %rdx
    564 	mov	%rdx, $acc7
    565 
    566 	mul	$t1			# a[3] * a[1]
    567 	add	%rax, $acc4
    568 	adc	\$0, %rdx
    569 
    570 	add	$acc7, $acc4
    571 	adc	%rdx, $acc5
    572 	adc	\$0, $acc6		# can't overflow
    573 
    574 	################################# *2
    575 	xor	$acc7, $acc7
    576 	mov	$acc0, %rax
    577 	add	$acc1, $acc1
    578 	adc	$acc2, $acc2
    579 	adc	$acc3, $acc3
    580 	adc	$acc4, $acc4
    581 	adc	$acc5, $acc5
    582 	adc	$acc6, $acc6
    583 	adc	\$0, $acc7
    584 
    585 	################################# Missing products
    586 	mul	%rax			# a[0] * a[0]
    587 	mov	%rax, $acc0
    588 	movq	%xmm1, %rax
    589 	mov	%rdx, $t1
    590 
    591 	mul	%rax			# a[1] * a[1]
    592 	add	$t1, $acc1
    593 	adc	%rax, $acc2
    594 	movq	%xmm2, %rax
    595 	adc	\$0, %rdx
    596 	mov	%rdx, $t1
    597 
    598 	mul	%rax			# a[2] * a[2]
    599 	add	$t1, $acc3
    600 	adc	%rax, $acc4
    601 	movq	%xmm3, %rax
    602 	adc	\$0, %rdx
    603 	mov	%rdx, $t1
    604 
    605 	 mov	$acc0, $t0
    606 	 imulq	8*4($a_ptr), $acc0	# *= .LordK
    607 
    608 	mul	%rax			# a[3] * a[3]
    609 	add	$t1, $acc5
    610 	adc	%rax, $acc6
    611 	 mov	8*0($a_ptr), %rax	# modulus[0]
    612 	adc	%rdx, $acc7		# can't overflow
    613 
    614 	################################# First reduction step
    615 	mul	$acc0
    616 	mov	$acc0, $t1
    617 	add	%rax, $t0		# guaranteed to be zero
    618 	mov	8*1($a_ptr), %rax	# modulus[1]
    619 	adc	%rdx, $t0
    620 
    621 	sub	$acc0, $acc2
    622 	sbb	\$0, $t1		# can't borrow
    623 
    624 	mul	$acc0
    625 	add	$t0, $acc1
    626 	adc	\$0, %rdx
    627 	add	%rax, $acc1
    628 	mov	$acc0, %rax
    629 	adc	%rdx, $acc2
    630 	mov	$acc0, %rdx
    631 	adc	\$0, $t1		# can't overflow
    632 
    633 	 mov	$acc1, $t0
    634 	 imulq	8*4($a_ptr), $acc1	# *= .LordK
    635 
    636 	shl	\$32, %rax
    637 	shr	\$32, %rdx
    638 	sub	%rax, $acc3
    639 	 mov	8*0($a_ptr), %rax
    640 	sbb	%rdx, $acc0		# can't borrow
    641 
    642 	add	$t1, $acc3
    643 	adc	\$0, $acc0		# can't overflow
    644 
    645 	################################# Second reduction step
    646 	mul	$acc1
    647 	mov	$acc1, $t1
    648 	add	%rax, $t0		# guaranteed to be zero
    649 	mov	8*1($a_ptr), %rax
    650 	adc	%rdx, $t0
    651 
    652 	sub	$acc1, $acc3
    653 	sbb	\$0, $t1		# can't borrow
    654 
    655 	mul	$acc1
    656 	add	$t0, $acc2
    657 	adc	\$0, %rdx
    658 	add	%rax, $acc2
    659 	mov	$acc1, %rax
    660 	adc	%rdx, $acc3
    661 	mov	$acc1, %rdx
    662 	adc	\$0, $t1		# can't overflow
    663 
    664 	 mov	$acc2, $t0
    665 	 imulq	8*4($a_ptr), $acc2	# *= .LordK
    666 
    667 	shl	\$32, %rax
    668 	shr	\$32, %rdx
    669 	sub	%rax, $acc0
    670 	 mov	8*0($a_ptr), %rax
    671 	sbb	%rdx, $acc1		# can't borrow
    672 
    673 	add	$t1, $acc0
    674 	adc	\$0, $acc1		# can't overflow
    675 
    676 	################################# Third reduction step
    677 	mul	$acc2
    678 	mov	$acc2, $t1
    679 	add	%rax, $t0		# guaranteed to be zero
    680 	mov	8*1($a_ptr), %rax
    681 	adc	%rdx, $t0
    682 
    683 	sub	$acc2, $acc0
    684 	sbb	\$0, $t1		# can't borrow
    685 
    686 	mul	$acc2
    687 	add	$t0, $acc3
    688 	adc	\$0, %rdx
    689 	add	%rax, $acc3
    690 	mov	$acc2, %rax
    691 	adc	%rdx, $acc0
    692 	mov	$acc2, %rdx
    693 	adc	\$0, $t1		# can't overflow
    694 
    695 	 mov	$acc3, $t0
    696 	 imulq	8*4($a_ptr), $acc3	# *= .LordK
    697 
    698 	shl	\$32, %rax
    699 	shr	\$32, %rdx
    700 	sub	%rax, $acc1
    701 	 mov	8*0($a_ptr), %rax
    702 	sbb	%rdx, $acc2		# can't borrow
    703 
    704 	add	$t1, $acc1
    705 	adc	\$0, $acc2		# can't overflow
    706 
    707 	################################# Last reduction step
    708 	mul	$acc3
    709 	mov	$acc3, $t1
    710 	add	%rax, $t0		# guaranteed to be zero
    711 	mov	8*1($a_ptr), %rax
    712 	adc	%rdx, $t0
    713 
    714 	sub	$acc3, $acc1
    715 	sbb	\$0, $t1		# can't borrow
    716 
    717 	mul	$acc3
    718 	add	$t0, $acc0
    719 	adc	\$0, %rdx
    720 	add	%rax, $acc0
    721 	mov	$acc3, %rax
    722 	adc	%rdx, $acc1
    723 	mov	$acc3, %rdx
    724 	adc	\$0, $t1		# can't overflow
    725 
    726 	shl	\$32, %rax
    727 	shr	\$32, %rdx
    728 	sub	%rax, $acc2
    729 	sbb	%rdx, $acc3		# can't borrow
    730 
    731 	add	$t1, $acc2
    732 	adc	\$0, $acc3		# can't overflow
    733 
    734 	################################# Add bits [511:256] of the sqr result
    735 	xor	%rdx, %rdx
    736 	add	$acc4, $acc0
    737 	adc	$acc5, $acc1
    738 	 mov	$acc0, $acc4
    739 	adc	$acc6, $acc2
    740 	adc	$acc7, $acc3
    741 	 mov	$acc1, %rax
    742 	adc	\$0, %rdx
    743 
    744 	################################# Compare to modulus
    745 	sub	8*0($a_ptr), $acc0
    746 	 mov	$acc2, $acc6
    747 	sbb	8*1($a_ptr), $acc1
    748 	sbb	8*2($a_ptr), $acc2
    749 	 mov	$acc3, $acc7
    750 	sbb	8*3($a_ptr), $acc3
    751 	sbb	\$0, %rdx
    752 
    753 	cmovc	$acc4, $acc0
    754 	cmovnc	$acc1, %rax
    755 	cmovnc	$acc2, $acc6
    756 	cmovnc	$acc3, $acc7
    757 
    758 	dec	$b_ptr
    759 	jnz	.Loop_ord_sqr
    760 
    761 	mov	$acc0, 8*0($r_ptr)
    762 	mov	%rax,  8*1($r_ptr)
    763 	pxor	%xmm1, %xmm1
    764 	mov	$acc6, 8*2($r_ptr)
    765 	pxor	%xmm2, %xmm2
    766 	mov	$acc7, 8*3($r_ptr)
    767 	pxor	%xmm3, %xmm3
    768 
    769 	mov	0(%rsp),%r15
    770 .cfi_restore	%r15
    771 	mov	8(%rsp),%r14
    772 .cfi_restore	%r14
    773 	mov	16(%rsp),%r13
    774 .cfi_restore	%r13
    775 	mov	24(%rsp),%r12
    776 .cfi_restore	%r12
    777 	mov	32(%rsp),%rbx
    778 .cfi_restore	%rbx
    779 	mov	40(%rsp),%rbp
    780 .cfi_restore	%rbp
    781 	lea	48(%rsp),%rsp
    782 .cfi_adjust_cfa_offset	-48
    783 .Lord_sqr_epilogue:
    784 	ret
    785 .cfi_endproc
    786 .size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
    787 ___
    788 
    789 $code.=<<___	if ($addx);
    790 ################################################################################
    791 .type	ecp_nistz256_ord_mul_montx,\@function,3
    792 .align	32
    793 ecp_nistz256_ord_mul_montx:
    794 .cfi_startproc
    795 .Lecp_nistz256_ord_mul_montx:
    796 	push	%rbp
    797 .cfi_push	%rbp
    798 	push	%rbx
    799 .cfi_push	%rbx
    800 	push	%r12
    801 .cfi_push	%r12
    802 	push	%r13
    803 .cfi_push	%r13
    804 	push	%r14
    805 .cfi_push	%r14
    806 	push	%r15
    807 .cfi_push	%r15
    808 .Lord_mulx_body:
    809 
    810 	mov	$b_org, $b_ptr
    811 	mov	8*0($b_org), %rdx
    812 	mov	8*0($a_ptr), $acc1
    813 	mov	8*1($a_ptr), $acc2
    814 	mov	8*2($a_ptr), $acc3
    815 	mov	8*3($a_ptr), $acc4
    816 	lea	-128($a_ptr), $a_ptr	# control u-op density
    817 	lea	.Lord-128(%rip), %r14
    818 	mov	.LordK(%rip), %r15
    819 
    820 	################################# Multiply by b[0]
    821 	mulx	$acc1, $acc0, $acc1
    822 	mulx	$acc2, $t0, $acc2
    823 	mulx	$acc3, $t1, $acc3
    824 	add	$t0, $acc1
    825 	mulx	$acc4, $t0, $acc4
    826 	 mov	$acc0, %rdx
    827 	 mulx	%r15, %rdx, %rax
    828 	adc	$t1, $acc2
    829 	adc	$t0, $acc3
    830 	adc	\$0, $acc4
    831 
    832 	################################# reduction
    833 	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
    834 	mulx	8*0+128(%r14), $t0, $t1
    835 	adcx	$t0, $acc0		# guaranteed to be zero
    836 	adox	$t1, $acc1
    837 
    838 	mulx	8*1+128(%r14), $t0, $t1
    839 	adcx	$t0, $acc1
    840 	adox	$t1, $acc2
    841 
    842 	mulx	8*2+128(%r14), $t0, $t1
    843 	adcx	$t0, $acc2
    844 	adox	$t1, $acc3
    845 
    846 	mulx	8*3+128(%r14), $t0, $t1
    847 	 mov	8*1($b_ptr), %rdx
    848 	adcx	$t0, $acc3
    849 	adox	$t1, $acc4
    850 	adcx	$acc0, $acc4
    851 	adox	$acc0, $acc5
    852 	adc	\$0, $acc5		# cf=0, of=0
    853 
    854 	################################# Multiply by b[1]
    855 	mulx	8*0+128($a_ptr), $t0, $t1
    856 	adcx	$t0, $acc1
    857 	adox	$t1, $acc2
    858 
    859 	mulx	8*1+128($a_ptr), $t0, $t1
    860 	adcx	$t0, $acc2
    861 	adox	$t1, $acc3
    862 
    863 	mulx	8*2+128($a_ptr), $t0, $t1
    864 	adcx	$t0, $acc3
    865 	adox	$t1, $acc4
    866 
    867 	mulx	8*3+128($a_ptr), $t0, $t1
    868 	 mov	$acc1, %rdx
    869 	 mulx	%r15, %rdx, %rax
    870 	adcx	$t0, $acc4
    871 	adox	$t1, $acc5
    872 
    873 	adcx	$acc0, $acc5
    874 	adox	$acc0, $acc0
    875 	adc	\$0, $acc0		# cf=0, of=0
    876 
    877 	################################# reduction
    878 	mulx	8*0+128(%r14), $t0, $t1
    879 	adcx	$t0, $acc1		# guaranteed to be zero
    880 	adox	$t1, $acc2
    881 
    882 	mulx	8*1+128(%r14), $t0, $t1
    883 	adcx	$t0, $acc2
    884 	adox	$t1, $acc3
    885 
    886 	mulx	8*2+128(%r14), $t0, $t1
    887 	adcx	$t0, $acc3
    888 	adox	$t1, $acc4
    889 
    890 	mulx	8*3+128(%r14), $t0, $t1
    891 	 mov	8*2($b_ptr), %rdx
    892 	adcx	$t0, $acc4
    893 	adox	$t1, $acc5
    894 	adcx	$acc1, $acc5
    895 	adox	$acc1, $acc0
    896 	adc	\$0, $acc0		# cf=0, of=0
    897 
    898 	################################# Multiply by b[2]
    899 	mulx	8*0+128($a_ptr), $t0, $t1
    900 	adcx	$t0, $acc2
    901 	adox	$t1, $acc3
    902 
    903 	mulx	8*1+128($a_ptr), $t0, $t1
    904 	adcx	$t0, $acc3
    905 	adox	$t1, $acc4
    906 
    907 	mulx	8*2+128($a_ptr), $t0, $t1
    908 	adcx	$t0, $acc4
    909 	adox	$t1, $acc5
    910 
    911 	mulx	8*3+128($a_ptr), $t0, $t1
    912 	 mov	$acc2, %rdx
    913 	 mulx	%r15, %rdx, %rax
    914 	adcx	$t0, $acc5
    915 	adox	$t1, $acc0
    916 
    917 	adcx	$acc1, $acc0
    918 	adox	$acc1, $acc1
    919 	adc	\$0, $acc1		# cf=0, of=0
    920 
    921 	################################# reduction
    922 	mulx	8*0+128(%r14), $t0, $t1
    923 	adcx	$t0, $acc2		# guaranteed to be zero
    924 	adox	$t1, $acc3
    925 
    926 	mulx	8*1+128(%r14), $t0, $t1
    927 	adcx	$t0, $acc3
    928 	adox	$t1, $acc4
    929 
    930 	mulx	8*2+128(%r14), $t0, $t1
    931 	adcx	$t0, $acc4
    932 	adox	$t1, $acc5
    933 
    934 	mulx	8*3+128(%r14), $t0, $t1
    935 	 mov	8*3($b_ptr), %rdx
    936 	adcx	$t0, $acc5
    937 	adox	$t1, $acc0
    938 	adcx	$acc2, $acc0
    939 	adox	$acc2, $acc1
    940 	adc	\$0, $acc1		# cf=0, of=0
    941 
    942 	################################# Multiply by b[3]
    943 	mulx	8*0+128($a_ptr), $t0, $t1
    944 	adcx	$t0, $acc3
    945 	adox	$t1, $acc4
    946 
    947 	mulx	8*1+128($a_ptr), $t0, $t1
    948 	adcx	$t0, $acc4
    949 	adox	$t1, $acc5
    950 
    951 	mulx	8*2+128($a_ptr), $t0, $t1
    952 	adcx	$t0, $acc5
    953 	adox	$t1, $acc0
    954 
    955 	mulx	8*3+128($a_ptr), $t0, $t1
    956 	 mov	$acc3, %rdx
    957 	 mulx	%r15, %rdx, %rax
    958 	adcx	$t0, $acc0
    959 	adox	$t1, $acc1
    960 
    961 	adcx	$acc2, $acc1
    962 	adox	$acc2, $acc2
    963 	adc	\$0, $acc2		# cf=0, of=0
    964 
    965 	################################# reduction
    966 	mulx	8*0+128(%r14), $t0, $t1
     967 	adcx	$t0, $acc3		# guaranteed to be zero
    968 	adox	$t1, $acc4
    969 
    970 	mulx	8*1+128(%r14), $t0, $t1
    971 	adcx	$t0, $acc4
    972 	adox	$t1, $acc5
    973 
    974 	mulx	8*2+128(%r14), $t0, $t1
    975 	adcx	$t0, $acc5
    976 	adox	$t1, $acc0
    977 
    978 	mulx	8*3+128(%r14), $t0, $t1
    979 	lea	128(%r14),%r14
    980 	 mov	$acc4, $t2
    981 	adcx	$t0, $acc0
    982 	adox	$t1, $acc1
    983 	 mov	$acc5, $t3
    984 	adcx	$acc3, $acc1
    985 	adox	$acc3, $acc2
    986 	adc	\$0, $acc2
    987 
    988 	#################################
    989 	# Branch-less conditional subtraction of P
    990 	 mov	$acc0, $t0
    991 	sub	8*0(%r14), $acc4
    992 	sbb	8*1(%r14), $acc5
    993 	sbb	8*2(%r14), $acc0
    994 	 mov	$acc1, $t1
    995 	sbb	8*3(%r14), $acc1
    996 	sbb	\$0, $acc2
    997 
    998 	cmovc	$t2, $acc4
    999 	cmovc	$t3, $acc5
   1000 	cmovc	$t0, $acc0
   1001 	cmovc	$t1, $acc1
   1002 
   1003 	mov	$acc4, 8*0($r_ptr)
   1004 	mov	$acc5, 8*1($r_ptr)
   1005 	mov	$acc0, 8*2($r_ptr)
   1006 	mov	$acc1, 8*3($r_ptr)
   1007 
   1008 	mov	0(%rsp),%r15
   1009 .cfi_restore	%r15
   1010 	mov	8(%rsp),%r14
   1011 .cfi_restore	%r14
   1012 	mov	16(%rsp),%r13
   1013 .cfi_restore	%r13
   1014 	mov	24(%rsp),%r12
   1015 .cfi_restore	%r12
   1016 	mov	32(%rsp),%rbx
   1017 .cfi_restore	%rbx
   1018 	mov	40(%rsp),%rbp
   1019 .cfi_restore	%rbp
   1020 	lea	48(%rsp),%rsp
   1021 .cfi_adjust_cfa_offset	-48
   1022 .Lord_mulx_epilogue:
   1023 	ret
   1024 .cfi_endproc
   1025 .size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
   1026 
   1027 .type	ecp_nistz256_ord_sqr_montx,\@function,3
   1028 .align	32
   1029 ecp_nistz256_ord_sqr_montx:
   1030 .cfi_startproc
   1031 .Lecp_nistz256_ord_sqr_montx:
   1032 	push	%rbp
   1033 .cfi_push	%rbp
   1034 	push	%rbx
   1035 .cfi_push	%rbx
   1036 	push	%r12
   1037 .cfi_push	%r12
   1038 	push	%r13
   1039 .cfi_push	%r13
   1040 	push	%r14
   1041 .cfi_push	%r14
   1042 	push	%r15
   1043 .cfi_push	%r15
   1044 .Lord_sqrx_body:
   1045 
   1046 	mov	$b_org, $b_ptr
   1047 	mov	8*0($a_ptr), %rdx
   1048 	mov	8*1($a_ptr), $acc6
   1049 	mov	8*2($a_ptr), $acc7
   1050 	mov	8*3($a_ptr), $acc0
   1051 	lea	.Lord(%rip), $a_ptr
   1052 	jmp	.Loop_ord_sqrx
   1053 
   1054 .align	32
   1055 .Loop_ord_sqrx:
   1056 	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
   1057 	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
   1058 	 mov	%rdx, %rax		# offload a[0]
   1059 	 movq	$acc6, %xmm1		# offload a[1]
   1060 	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
   1061 	 mov	$acc6, %rdx
   1062 	add	$t0, $acc2
   1063 	 movq	$acc7, %xmm2		# offload a[2]
   1064 	adc	$t1, $acc3
   1065 	adc	\$0, $acc4
   1066 	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
   1067 	#################################
   1068 	mulx	$acc7, $t0, $t1		# a[1]*a[2]
   1069 	adcx	$t0, $acc3
   1070 	adox	$t1, $acc4
   1071 
   1072 	mulx	$acc0, $t0, $t1		# a[1]*a[3]
   1073 	 mov	$acc7, %rdx
   1074 	adcx	$t0, $acc4
   1075 	adox	$t1, $acc5
   1076 	adc	\$0, $acc5
   1077 	#################################
   1078 	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
   1079 	mov	%rax, %rdx
   1080 	 movq	$acc0, %xmm3		# offload a[3]
   1081 	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
   1082 	 adcx	$acc1, $acc1		# acc1:6<<1
   1083 	adox	$t0, $acc5
   1084 	 adcx	$acc2, $acc2
   1085 	adox	$acc7, $acc6		# of=0
   1086 
   1087 	################################# a[i]*a[i]
   1088 	mulx	%rdx, $acc0, $t1
   1089 	movq	%xmm1, %rdx
   1090 	 adcx	$acc3, $acc3
   1091 	adox	$t1, $acc1
   1092 	 adcx	$acc4, $acc4
   1093 	mulx	%rdx, $t0, $t4
   1094 	movq	%xmm2, %rdx
   1095 	 adcx	$acc5, $acc5
   1096 	adox	$t0, $acc2
   1097 	 adcx	$acc6, $acc6
   1098 	mulx	%rdx, $t0, $t1
   1099 	.byte	0x67
   1100 	movq	%xmm3, %rdx
   1101 	adox	$t4, $acc3
   1102 	 adcx	$acc7, $acc7
   1103 	adox	$t0, $acc4
   1104 	adox	$t1, $acc5
   1105 	mulx	%rdx, $t0, $t4
   1106 	adox	$t0, $acc6
   1107 	adox	$t4, $acc7
   1108 
   1109 	################################# reduction
   1110 	mov	$acc0, %rdx
   1111 	mulx	8*4($a_ptr), %rdx, $t0
   1112 
   1113 	xor	%rax, %rax		# cf=0, of=0
   1114 	mulx	8*0($a_ptr), $t0, $t1
   1115 	adcx	$t0, $acc0		# guaranteed to be zero
   1116 	adox	$t1, $acc1
   1117 	mulx	8*1($a_ptr), $t0, $t1
   1118 	adcx	$t0, $acc1
   1119 	adox	$t1, $acc2
   1120 	mulx	8*2($a_ptr), $t0, $t1
   1121 	adcx	$t0, $acc2
   1122 	adox	$t1, $acc3
   1123 	mulx	8*3($a_ptr), $t0, $t1
   1124 	adcx	$t0, $acc3
   1125 	adox	$t1, $acc0		# of=0
   1126 	adcx	%rax, $acc0		# cf=0
   1127 
   1128 	#################################
   1129 	mov	$acc1, %rdx
   1130 	mulx	8*4($a_ptr), %rdx, $t0
   1131 
   1132 	mulx	8*0($a_ptr), $t0, $t1
   1133 	adox	$t0, $acc1		# guaranteed to be zero
   1134 	adcx	$t1, $acc2
   1135 	mulx	8*1($a_ptr), $t0, $t1
   1136 	adox	$t0, $acc2
   1137 	adcx	$t1, $acc3
   1138 	mulx	8*2($a_ptr), $t0, $t1
   1139 	adox	$t0, $acc3
   1140 	adcx	$t1, $acc0
   1141 	mulx	8*3($a_ptr), $t0, $t1
   1142 	adox	$t0, $acc0
   1143 	adcx	$t1, $acc1		# cf=0
   1144 	adox	%rax, $acc1		# of=0
   1145 
   1146 	#################################
   1147 	mov	$acc2, %rdx
   1148 	mulx	8*4($a_ptr), %rdx, $t0
   1149 
   1150 	mulx	8*0($a_ptr), $t0, $t1
   1151 	adcx	$t0, $acc2		# guaranteed to be zero
   1152 	adox	$t1, $acc3
   1153 	mulx	8*1($a_ptr), $t0, $t1
   1154 	adcx	$t0, $acc3
   1155 	adox	$t1, $acc0
   1156 	mulx	8*2($a_ptr), $t0, $t1
   1157 	adcx	$t0, $acc0
   1158 	adox	$t1, $acc1
   1159 	mulx	8*3($a_ptr), $t0, $t1
   1160 	adcx	$t0, $acc1
   1161 	adox	$t1, $acc2		# of=0
   1162 	adcx	%rax, $acc2		# cf=0
   1163 
   1164 	#################################
   1165 	mov	$acc3, %rdx
   1166 	mulx	8*4($a_ptr), %rdx, $t0
   1167 
   1168 	mulx	8*0($a_ptr), $t0, $t1
   1169 	adox	$t0, $acc3		# guaranteed to be zero
   1170 	adcx	$t1, $acc0
   1171 	mulx	8*1($a_ptr), $t0, $t1
   1172 	adox	$t0, $acc0
   1173 	adcx	$t1, $acc1
   1174 	mulx	8*2($a_ptr), $t0, $t1
   1175 	adox	$t0, $acc1
   1176 	adcx	$t1, $acc2
   1177 	mulx	8*3($a_ptr), $t0, $t1
   1178 	adox	$t0, $acc2
   1179 	adcx	$t1, $acc3
   1180 	adox	%rax, $acc3
   1181 
   1182 	################################# accumulate upper half
   1183 	add	$acc0, $acc4		# add	$acc4, $acc0
   1184 	adc	$acc5, $acc1
   1185 	 mov	$acc4, %rdx
   1186 	adc	$acc6, $acc2
   1187 	adc	$acc7, $acc3
   1188 	 mov	$acc1, $acc6
   1189 	adc	\$0, %rax
   1190 
   1191 	################################# compare to modulus
   1192 	sub	8*0($a_ptr), $acc4
   1193 	 mov	$acc2, $acc7
   1194 	sbb	8*1($a_ptr), $acc1
   1195 	sbb	8*2($a_ptr), $acc2
   1196 	 mov	$acc3, $acc0
   1197 	sbb	8*3($a_ptr), $acc3
   1198 	sbb	\$0, %rax
   1199 
   1200 	cmovnc	$acc4, %rdx
   1201 	cmovnc	$acc1, $acc6
   1202 	cmovnc	$acc2, $acc7
   1203 	cmovnc	$acc3, $acc0
   1204 
   1205 	dec	$b_ptr
   1206 	jnz	.Loop_ord_sqrx
   1207 
   1208 	mov	%rdx, 8*0($r_ptr)
   1209 	mov	$acc6, 8*1($r_ptr)
   1210 	pxor	%xmm1, %xmm1
   1211 	mov	$acc7, 8*2($r_ptr)
   1212 	pxor	%xmm2, %xmm2
   1213 	mov	$acc0, 8*3($r_ptr)
   1214 	pxor	%xmm3, %xmm3
   1215 
   1216 	mov	0(%rsp),%r15
   1217 .cfi_restore	%r15
   1218 	mov	8(%rsp),%r14
   1219 .cfi_restore	%r14
   1220 	mov	16(%rsp),%r13
   1221 .cfi_restore	%r13
   1222 	mov	24(%rsp),%r12
   1223 .cfi_restore	%r12
   1224 	mov	32(%rsp),%rbx
   1225 .cfi_restore	%rbx
   1226 	mov	40(%rsp),%rbp
   1227 .cfi_restore	%rbp
   1228 	lea	48(%rsp),%rsp
   1229 .cfi_adjust_cfa_offset	-48
   1230 .Lord_sqrx_epilogue:
   1231 	ret
   1232 .cfi_endproc
   1233 .size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
   1234 ___
   1235 
   1236 $code.=<<___;
   1237 ################################################################################
   1238 # void ecp_nistz256_mul_mont(
   1239 #   uint64_t res[4],
   1240 #   uint64_t a[4],
   1241 #   uint64_t b[4]);
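         # Computes res = a * b * 2^-256 mod p (Montgomery multiplication modulo
         # the P-256 prime).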
   1242 
   1243 .globl	ecp_nistz256_mul_mont
   1244 .type	ecp_nistz256_mul_mont,\@function,3
   1245 .align	32
   1246 ecp_nistz256_mul_mont:
   1247 .cfi_startproc
   1248 ___
   1249 $code.=<<___	if ($addx);
   1250 	leaq	OPENSSL_ia32cap_P(%rip), %rcx
   1251 	mov	8(%rcx), %rcx
   1252 	and	\$0x80100, %ecx
   1253 ___
   1254 $code.=<<___;
   1255 .Lmul_mont:
   1256 	push	%rbp
   1257 .cfi_push	%rbp
   1258 	push	%rbx
   1259 .cfi_push	%rbx
   1260 	push	%r12
   1261 .cfi_push	%r12
   1262 	push	%r13
   1263 .cfi_push	%r13
   1264 	push	%r14
   1265 .cfi_push	%r14
   1266 	push	%r15
   1267 .cfi_push	%r15
   1268 .Lmul_body:
   1269 ___
   1270 $code.=<<___	if ($addx);
   1271 	cmp	\$0x80100, %ecx
   1272 	je	.Lmul_montx
   1273 ___
   1274 $code.=<<___;
   1275 	mov	$b_org, $b_ptr
   1276 	mov	8*0($b_org), %rax
   1277 	mov	8*0($a_ptr), $acc1
   1278 	mov	8*1($a_ptr), $acc2
   1279 	mov	8*2($a_ptr), $acc3
   1280 	mov	8*3($a_ptr), $acc4
   1281 
   1282 	call	__ecp_nistz256_mul_montq
   1283 ___
   1284 $code.=<<___	if ($addx);
   1285 	jmp	.Lmul_mont_done
   1286 
   1287 .align	32
   1288 .Lmul_montx:
   1289 	mov	$b_org, $b_ptr
   1290 	mov	8*0($b_org), %rdx
   1291 	mov	8*0($a_ptr), $acc1
   1292 	mov	8*1($a_ptr), $acc2
   1293 	mov	8*2($a_ptr), $acc3
   1294 	mov	8*3($a_ptr), $acc4
   1295 	lea	-128($a_ptr), $a_ptr	# control u-op density
   1296 
   1297 	call	__ecp_nistz256_mul_montx
   1298 ___
   1299 $code.=<<___;
   1300 .Lmul_mont_done:
   1301 	mov	0(%rsp),%r15
   1302 .cfi_restore	%r15
   1303 	mov	8(%rsp),%r14
   1304 .cfi_restore	%r14
   1305 	mov	16(%rsp),%r13
   1306 .cfi_restore	%r13
   1307 	mov	24(%rsp),%r12
   1308 .cfi_restore	%r12
   1309 	mov	32(%rsp),%rbx
   1310 .cfi_restore	%rbx
   1311 	mov	40(%rsp),%rbp
   1312 .cfi_restore	%rbp
   1313 	lea	48(%rsp),%rsp
   1314 .cfi_adjust_cfa_offset	-48
   1315 .Lmul_epilogue:
   1316 	ret
   1317 .cfi_endproc
   1318 .size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
   1319 
   1320 .type	__ecp_nistz256_mul_montq,\@abi-omnipotent
   1321 .align	32
   1322 __ecp_nistz256_mul_montq:
   1323 .cfi_startproc
   1324 	########################################################################
   1325 	# Multiply a by b[0]
   1326 	mov	%rax, $t1
   1327 	mulq	$acc1
   1328 	mov	.Lpoly+8*1(%rip),$poly1
   1329 	mov	%rax, $acc0
   1330 	mov	$t1, %rax
   1331 	mov	%rdx, $acc1
   1332 
   1333 	mulq	$acc2
   1334 	mov	.Lpoly+8*3(%rip),$poly3
   1335 	add	%rax, $acc1
   1336 	mov	$t1, %rax
   1337 	adc	\$0, %rdx
   1338 	mov	%rdx, $acc2
   1339 
   1340 	mulq	$acc3
   1341 	add	%rax, $acc2
   1342 	mov	$t1, %rax
   1343 	adc	\$0, %rdx
   1344 	mov	%rdx, $acc3
   1345 
   1346 	mulq	$acc4
   1347 	add	%rax, $acc3
   1348 	 mov	$acc0, %rax
   1349 	adc	\$0, %rdx
   1350 	xor	$acc5, $acc5
   1351 	mov	%rdx, $acc4
   1352 
   1353 	########################################################################
   1354 	# First reduction step
   1355 	# Basically now we want to multiply acc[0] by p256,
   1356 	# and add the result to the acc.
   1357 	# Due to the special form of p256 we do some optimizations
   1358 	#
   1359 	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
   1360 	# then we add acc[0] and get acc[0] x 2^96
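         	# (The two low limbs of p are 2^96 - 1, and p is -1 mod 2^64, so the
         	# Montgomery multiplier for this limb is acc[0] itself; only .Lpoly[3]
         	# needs a real multiplication, performed by the mulq below.)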
   1361 
   1362 	mov	$acc0, $t1
   1363 	shl	\$32, $acc0
   1364 	mulq	$poly3
   1365 	shr	\$32, $t1
   1366 	add	$acc0, $acc1		# +=acc[0]<<96
   1367 	adc	$t1, $acc2
   1368 	adc	%rax, $acc3
   1369 	 mov	8*1($b_ptr), %rax
   1370 	adc	%rdx, $acc4
   1371 	adc	\$0, $acc5
   1372 	xor	$acc0, $acc0
   1373 
   1374 	########################################################################
   1375 	# Multiply by b[1]
   1376 	mov	%rax, $t1
   1377 	mulq	8*0($a_ptr)
   1378 	add	%rax, $acc1
   1379 	mov	$t1, %rax
   1380 	adc	\$0, %rdx
   1381 	mov	%rdx, $t0
   1382 
   1383 	mulq	8*1($a_ptr)
   1384 	add	$t0, $acc2
   1385 	adc	\$0, %rdx
   1386 	add	%rax, $acc2
   1387 	mov	$t1, %rax
   1388 	adc	\$0, %rdx
   1389 	mov	%rdx, $t0
   1390 
   1391 	mulq	8*2($a_ptr)
   1392 	add	$t0, $acc3
   1393 	adc	\$0, %rdx
   1394 	add	%rax, $acc3
   1395 	mov	$t1, %rax
   1396 	adc	\$0, %rdx
   1397 	mov	%rdx, $t0
   1398 
   1399 	mulq	8*3($a_ptr)
   1400 	add	$t0, $acc4
   1401 	adc	\$0, %rdx
   1402 	add	%rax, $acc4
   1403 	 mov	$acc1, %rax
   1404 	adc	%rdx, $acc5
   1405 	adc	\$0, $acc0
   1406 
   1407 	########################################################################
   1408 	# Second reduction step
   1409 	mov	$acc1, $t1
   1410 	shl	\$32, $acc1
   1411 	mulq	$poly3
   1412 	shr	\$32, $t1
   1413 	add	$acc1, $acc2
   1414 	adc	$t1, $acc3
   1415 	adc	%rax, $acc4
   1416 	 mov	8*2($b_ptr), %rax
   1417 	adc	%rdx, $acc5
   1418 	adc	\$0, $acc0
   1419 	xor	$acc1, $acc1
   1420 
   1421 	########################################################################
   1422 	# Multiply by b[2]
   1423 	mov	%rax, $t1
   1424 	mulq	8*0($a_ptr)
   1425 	add	%rax, $acc2
   1426 	mov	$t1, %rax
   1427 	adc	\$0, %rdx
   1428 	mov	%rdx, $t0
   1429 
   1430 	mulq	8*1($a_ptr)
   1431 	add	$t0, $acc3
   1432 	adc	\$0, %rdx
   1433 	add	%rax, $acc3
   1434 	mov	$t1, %rax
   1435 	adc	\$0, %rdx
   1436 	mov	%rdx, $t0
   1437 
   1438 	mulq	8*2($a_ptr)
   1439 	add	$t0, $acc4
   1440 	adc	\$0, %rdx
   1441 	add	%rax, $acc4
   1442 	mov	$t1, %rax
   1443 	adc	\$0, %rdx
   1444 	mov	%rdx, $t0
   1445 
   1446 	mulq	8*3($a_ptr)
   1447 	add	$t0, $acc5
   1448 	adc	\$0, %rdx
   1449 	add	%rax, $acc5
   1450 	 mov	$acc2, %rax
   1451 	adc	%rdx, $acc0
   1452 	adc	\$0, $acc1
   1453 
   1454 	########################################################################
   1455 	# Third reduction step
   1456 	mov	$acc2, $t1
   1457 	shl	\$32, $acc2
   1458 	mulq	$poly3
   1459 	shr	\$32, $t1
   1460 	add	$acc2, $acc3
   1461 	adc	$t1, $acc4
   1462 	adc	%rax, $acc5
   1463 	 mov	8*3($b_ptr), %rax
   1464 	adc	%rdx, $acc0
   1465 	adc	\$0, $acc1
   1466 	xor	$acc2, $acc2
   1467 
   1468 	########################################################################
   1469 	# Multiply by b[3]
   1470 	mov	%rax, $t1
   1471 	mulq	8*0($a_ptr)
   1472 	add	%rax, $acc3
   1473 	mov	$t1, %rax
   1474 	adc	\$0, %rdx
   1475 	mov	%rdx, $t0
   1476 
   1477 	mulq	8*1($a_ptr)
   1478 	add	$t0, $acc4
   1479 	adc	\$0, %rdx
   1480 	add	%rax, $acc4
   1481 	mov	$t1, %rax
   1482 	adc	\$0, %rdx
   1483 	mov	%rdx, $t0
   1484 
   1485 	mulq	8*2($a_ptr)
   1486 	add	$t0, $acc5
   1487 	adc	\$0, %rdx
   1488 	add	%rax, $acc5
   1489 	mov	$t1, %rax
   1490 	adc	\$0, %rdx
   1491 	mov	%rdx, $t0
   1492 
   1493 	mulq	8*3($a_ptr)
   1494 	add	$t0, $acc0
   1495 	adc	\$0, %rdx
   1496 	add	%rax, $acc0
   1497 	 mov	$acc3, %rax
   1498 	adc	%rdx, $acc1
   1499 	adc	\$0, $acc2
   1500 
   1501 	########################################################################
   1502 	# Final reduction step
   1503 	mov	$acc3, $t1
   1504 	shl	\$32, $acc3
   1505 	mulq	$poly3
   1506 	shr	\$32, $t1
   1507 	add	$acc3, $acc4
   1508 	adc	$t1, $acc5
   1509 	 mov	$acc4, $t0
   1510 	adc	%rax, $acc0
   1511 	adc	%rdx, $acc1
   1512 	 mov	$acc5, $t1
   1513 	adc	\$0, $acc2
   1514 
   1515 	########################################################################
   1516 	# Branch-less conditional subtraction of P
   1517 	sub	\$-1, $acc4		# .Lpoly[0]
   1518 	 mov	$acc0, $t2
   1519 	sbb	$poly1, $acc5		# .Lpoly[1]
   1520 	sbb	\$0, $acc0		# .Lpoly[2]
   1521 	 mov	$acc1, $t3
   1522 	sbb	$poly3, $acc1		# .Lpoly[3]
   1523 	sbb	\$0, $acc2
   1524 
   1525 	cmovc	$t0, $acc4
   1526 	cmovc	$t1, $acc5
   1527 	mov	$acc4, 8*0($r_ptr)
   1528 	cmovc	$t2, $acc0
   1529 	mov	$acc5, 8*1($r_ptr)
   1530 	cmovc	$t3, $acc1
   1531 	mov	$acc0, 8*2($r_ptr)
   1532 	mov	$acc1, 8*3($r_ptr)
   1533 
   1534 	ret
   1535 .cfi_endproc
   1536 .size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
   1537 
   1538 ################################################################################
   1539 # void ecp_nistz256_sqr_mont(
   1540 #   uint64_t res[4],
   1541 #   uint64_t a[4]);
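         # Computes res = a * a * 2^-256 mod p.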
   1542 
   1543 # we optimize the square according to S.Gueron and V.Krasnov,
   1544 # "Speeding up Big-Number Squaring"
   1545 .globl	ecp_nistz256_sqr_mont
   1546 .type	ecp_nistz256_sqr_mont,\@function,2
   1547 .align	32
   1548 ecp_nistz256_sqr_mont:
   1549 .cfi_startproc
   1550 ___
   1551 $code.=<<___	if ($addx);
   1552 	leaq	OPENSSL_ia32cap_P(%rip), %rcx
   1553 	mov	8(%rcx), %rcx
   1554 	and	\$0x80100, %ecx
   1555 ___
   1556 $code.=<<___;
   1557 	push	%rbp
   1558 .cfi_push	%rbp
   1559 	push	%rbx
   1560 .cfi_push	%rbx
   1561 	push	%r12
   1562 .cfi_push	%r12
   1563 	push	%r13
   1564 .cfi_push	%r13
   1565 	push	%r14
   1566 .cfi_push	%r14
   1567 	push	%r15
   1568 .cfi_push	%r15
   1569 .Lsqr_body:
   1570 ___
   1571 $code.=<<___	if ($addx);
   1572 	cmp	\$0x80100, %ecx
   1573 	je	.Lsqr_montx
   1574 ___
   1575 $code.=<<___;
   1576 	mov	8*0($a_ptr), %rax
   1577 	mov	8*1($a_ptr), $acc6
   1578 	mov	8*2($a_ptr), $acc7
   1579 	mov	8*3($a_ptr), $acc0
   1580 
   1581 	call	__ecp_nistz256_sqr_montq
   1582 ___
   1583 $code.=<<___	if ($addx);
   1584 	jmp	.Lsqr_mont_done
   1585 
   1586 .align	32
   1587 .Lsqr_montx:
   1588 	mov	8*0($a_ptr), %rdx
   1589 	mov	8*1($a_ptr), $acc6
   1590 	mov	8*2($a_ptr), $acc7
   1591 	mov	8*3($a_ptr), $acc0
   1592 	lea	-128($a_ptr), $a_ptr	# control u-op density
   1593 
   1594 	call	__ecp_nistz256_sqr_montx
   1595 ___
   1596 $code.=<<___;
   1597 .Lsqr_mont_done:
   1598 	mov	0(%rsp),%r15
   1599 .cfi_restore	%r15
   1600 	mov	8(%rsp),%r14
   1601 .cfi_restore	%r14
   1602 	mov	16(%rsp),%r13
   1603 .cfi_restore	%r13
   1604 	mov	24(%rsp),%r12
   1605 .cfi_restore	%r12
   1606 	mov	32(%rsp),%rbx
   1607 .cfi_restore	%rbx
   1608 	mov	40(%rsp),%rbp
   1609 .cfi_restore	%rbp
   1610 	lea	48(%rsp),%rsp
   1611 .cfi_adjust_cfa_offset	-48
   1612 .Lsqr_epilogue:
   1613 	ret
   1614 .cfi_endproc
   1615 .size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
   1616 
   1617 .type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
   1618 .align	32
   1619 __ecp_nistz256_sqr_montq:
   1620 .cfi_startproc
   1621 	mov	%rax, $acc5
   1622 	mulq	$acc6			# a[1]*a[0]
   1623 	mov	%rax, $acc1
   1624 	mov	$acc7, %rax
   1625 	mov	%rdx, $acc2
   1626 
   1627 	mulq	$acc5			# a[0]*a[2]
   1628 	add	%rax, $acc2
   1629 	mov	$acc0, %rax
   1630 	adc	\$0, %rdx
   1631 	mov	%rdx, $acc3
   1632 
   1633 	mulq	$acc5			# a[0]*a[3]
   1634 	add	%rax, $acc3
   1635 	 mov	$acc7, %rax
   1636 	adc	\$0, %rdx
   1637 	mov	%rdx, $acc4
   1638 
   1639 	#################################
   1640 	mulq	$acc6			# a[1]*a[2]
   1641 	add	%rax, $acc3
   1642 	mov	$acc0, %rax
   1643 	adc	\$0, %rdx
   1644 	mov	%rdx, $t1
   1645 
   1646 	mulq	$acc6			# a[1]*a[3]
   1647 	add	%rax, $acc4
   1648 	 mov	$acc0, %rax
   1649 	adc	\$0, %rdx
   1650 	add	$t1, $acc4
   1651 	mov	%rdx, $acc5
   1652 	adc	\$0, $acc5
   1653 
   1654 	#################################
   1655 	mulq	$acc7			# a[2]*a[3]
   1656 	xor	$acc7, $acc7
   1657 	add	%rax, $acc5
   1658 	 mov	8*0($a_ptr), %rax
   1659 	mov	%rdx, $acc6
   1660 	adc	\$0, $acc6
   1661 
   1662 	add	$acc1, $acc1		# acc1:6<<1
   1663 	adc	$acc2, $acc2
   1664 	adc	$acc3, $acc3
   1665 	adc	$acc4, $acc4
   1666 	adc	$acc5, $acc5
   1667 	adc	$acc6, $acc6
   1668 	adc	\$0, $acc7
   1669 
   1670 	mulq	%rax
   1671 	mov	%rax, $acc0
   1672 	mov	8*1($a_ptr), %rax
   1673 	mov	%rdx, $t0
   1674 
   1675 	mulq	%rax
   1676 	add	$t0, $acc1
   1677 	adc	%rax, $acc2
   1678 	mov	8*2($a_ptr), %rax
   1679 	adc	\$0, %rdx
   1680 	mov	%rdx, $t0
   1681 
   1682 	mulq	%rax
   1683 	add	$t0, $acc3
   1684 	adc	%rax, $acc4
   1685 	mov	8*3($a_ptr), %rax
   1686 	adc	\$0, %rdx
   1687 	mov	%rdx, $t0
   1688 
   1689 	mulq	%rax
   1690 	add	$t0, $acc5
   1691 	adc	%rax, $acc6
   1692 	 mov	$acc0, %rax
   1693 	adc	%rdx, $acc7
   1694 
   1695 	mov	.Lpoly+8*1(%rip), $a_ptr
   1696 	mov	.Lpoly+8*3(%rip), $t1
   1697 
   1698 	##########################################
   1699 	# Now the reduction
   1700 	# First iteration
   1701 	mov	$acc0, $t0
   1702 	shl	\$32, $acc0
   1703 	mulq	$t1
   1704 	shr	\$32, $t0
   1705 	add	$acc0, $acc1		# +=acc[0]<<96
   1706 	adc	$t0, $acc2
   1707 	adc	%rax, $acc3
   1708 	 mov	$acc1, %rax
   1709 	adc	\$0, %rdx
   1710 
   1711 	##########################################
   1712 	# Second iteration
   1713 	mov	$acc1, $t0
   1714 	shl	\$32, $acc1
   1715 	mov	%rdx, $acc0
   1716 	mulq	$t1
   1717 	shr	\$32, $t0
   1718 	add	$acc1, $acc2
   1719 	adc	$t0, $acc3
   1720 	adc	%rax, $acc0
   1721 	 mov	$acc2, %rax
   1722 	adc	\$0, %rdx
   1723 
   1724 	##########################################
   1725 	# Third iteration
   1726 	mov	$acc2, $t0
   1727 	shl	\$32, $acc2
   1728 	mov	%rdx, $acc1
   1729 	mulq	$t1
   1730 	shr	\$32, $t0
   1731 	add	$acc2, $acc3
   1732 	adc	$t0, $acc0
   1733 	adc	%rax, $acc1
   1734 	 mov	$acc3, %rax
   1735 	adc	\$0, %rdx
   1736 
   1737 	###########################################
   1738 	# Last iteration
   1739 	mov	$acc3, $t0
   1740 	shl	\$32, $acc3
   1741 	mov	%rdx, $acc2
   1742 	mulq	$t1
   1743 	shr	\$32, $t0
   1744 	add	$acc3, $acc0
   1745 	adc	$t0, $acc1
   1746 	adc	%rax, $acc2
   1747 	adc	\$0, %rdx
   1748 	xor	$acc3, $acc3
   1749 
   1750 	############################################
   1751 	# Add the rest of the acc
   1752 	add	$acc0, $acc4
   1753 	adc	$acc1, $acc5
   1754 	 mov	$acc4, $acc0
   1755 	adc	$acc2, $acc6
   1756 	adc	%rdx, $acc7
   1757 	 mov	$acc5, $acc1
   1758 	adc	\$0, $acc3
   1759 
   1760 	sub	\$-1, $acc4		# .Lpoly[0]
   1761 	 mov	$acc6, $acc2
   1762 	sbb	$a_ptr, $acc5		# .Lpoly[1]
   1763 	sbb	\$0, $acc6		# .Lpoly[2]
   1764 	 mov	$acc7, $t0
   1765 	sbb	$t1, $acc7		# .Lpoly[3]
   1766 	sbb	\$0, $acc3
   1767 
   1768 	cmovc	$acc0, $acc4
   1769 	cmovc	$acc1, $acc5
   1770 	mov	$acc4, 8*0($r_ptr)
   1771 	cmovc	$acc2, $acc6
   1772 	mov	$acc5, 8*1($r_ptr)
   1773 	cmovc	$t0, $acc7
   1774 	mov	$acc6, 8*2($r_ptr)
   1775 	mov	$acc7, 8*3($r_ptr)
   1776 
   1777 	ret
   1778 .cfi_endproc
   1779 .size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
   1780 ___
   1781 
   1782 if ($addx) {
   1783 $code.=<<___;
   1784 .type	__ecp_nistz256_mul_montx,\@abi-omnipotent
   1785 .align	32
   1786 __ecp_nistz256_mul_montx:
   1787 .cfi_startproc
   1788 	########################################################################
   1789 	# Multiply by b[0]
   1790 	mulx	$acc1, $acc0, $acc1
   1791 	mulx	$acc2, $t0, $acc2
   1792 	mov	\$32, $poly1
   1793 	xor	$acc5, $acc5		# cf=0
   1794 	mulx	$acc3, $t1, $acc3
   1795 	mov	.Lpoly+8*3(%rip), $poly3
   1796 	adc	$t0, $acc1
   1797 	mulx	$acc4, $t0, $acc4
   1798 	 mov	$acc0, %rdx
   1799 	adc	$t1, $acc2
   1800 	 shlx	$poly1,$acc0,$t1
   1801 	adc	$t0, $acc3
   1802 	 shrx	$poly1,$acc0,$t0
   1803 	adc	\$0, $acc4
   1804 
   1805 	########################################################################
   1806 	# First reduction step
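         	# The shlx/shrx above split acc[0]*2^96 across limbs 1 and 2; together
         	# with the mulx by .Lpoly[3] below this adds acc[0] * p to the accumulator.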
   1807 	add	$t1, $acc1
   1808 	adc	$t0, $acc2
   1809 
   1810 	mulx	$poly3, $t0, $t1
   1811 	 mov	8*1($b_ptr), %rdx
   1812 	adc	$t0, $acc3
   1813 	adc	$t1, $acc4
   1814 	adc	\$0, $acc5
   1815 	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
   1816 
   1817 	########################################################################
   1818 	# Multiply by b[1]
   1819 	mulx	8*0+128($a_ptr), $t0, $t1
   1820 	adcx	$t0, $acc1
   1821 	adox	$t1, $acc2
   1822 
   1823 	mulx	8*1+128($a_ptr), $t0, $t1
   1824 	adcx	$t0, $acc2
   1825 	adox	$t1, $acc3
   1826 
   1827 	mulx	8*2+128($a_ptr), $t0, $t1
   1828 	adcx	$t0, $acc3
   1829 	adox	$t1, $acc4
   1830 
   1831 	mulx	8*3+128($a_ptr), $t0, $t1
   1832 	 mov	$acc1, %rdx
   1833 	adcx	$t0, $acc4
   1834 	 shlx	$poly1, $acc1, $t0
   1835 	adox	$t1, $acc5
   1836 	 shrx	$poly1, $acc1, $t1
   1837 
   1838 	adcx	$acc0, $acc5
   1839 	adox	$acc0, $acc0
   1840 	adc	\$0, $acc0
   1841 
   1842 	########################################################################
   1843 	# Second reduction step
   1844 	add	$t0, $acc2
   1845 	adc	$t1, $acc3
   1846 
   1847 	mulx	$poly3, $t0, $t1
   1848 	 mov	8*2($b_ptr), %rdx
   1849 	adc	$t0, $acc4
   1850 	adc	$t1, $acc5
   1851 	adc	\$0, $acc0
   1852 	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
   1853 
   1854 	########################################################################
   1855 	# Multiply by b[2]
   1856 	mulx	8*0+128($a_ptr), $t0, $t1
   1857 	adcx	$t0, $acc2
   1858 	adox	$t1, $acc3
   1859 
   1860 	mulx	8*1+128($a_ptr), $t0, $t1
   1861 	adcx	$t0, $acc3
   1862 	adox	$t1, $acc4
   1863 
   1864 	mulx	8*2+128($a_ptr), $t0, $t1
   1865 	adcx	$t0, $acc4
   1866 	adox	$t1, $acc5
   1867 
   1868 	mulx	8*3+128($a_ptr), $t0, $t1
   1869 	 mov	$acc2, %rdx
   1870 	adcx	$t0, $acc5
   1871 	 shlx	$poly1, $acc2, $t0
   1872 	adox	$t1, $acc0
   1873 	 shrx	$poly1, $acc2, $t1
   1874 
   1875 	adcx	$acc1, $acc0
   1876 	adox	$acc1, $acc1
   1877 	adc	\$0, $acc1
   1878 
   1879 	########################################################################
   1880 	# Third reduction step
   1881 	add	$t0, $acc3
   1882 	adc	$t1, $acc4
   1883 
   1884 	mulx	$poly3, $t0, $t1
   1885 	 mov	8*3($b_ptr), %rdx
   1886 	adc	$t0, $acc5
   1887 	adc	$t1, $acc0
   1888 	adc	\$0, $acc1
   1889 	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
   1890 
   1891 	########################################################################
   1892 	# Multiply by b[3]
   1893 	mulx	8*0+128($a_ptr), $t0, $t1
   1894 	adcx	$t0, $acc3
   1895 	adox	$t1, $acc4
   1896 
   1897 	mulx	8*1+128($a_ptr), $t0, $t1
   1898 	adcx	$t0, $acc4
   1899 	adox	$t1, $acc5
   1900 
   1901 	mulx	8*2+128($a_ptr), $t0, $t1
   1902 	adcx	$t0, $acc5
   1903 	adox	$t1, $acc0
   1904 
   1905 	mulx	8*3+128($a_ptr), $t0, $t1
   1906 	 mov	$acc3, %rdx
   1907 	adcx	$t0, $acc0
   1908 	 shlx	$poly1, $acc3, $t0
   1909 	adox	$t1, $acc1
   1910 	 shrx	$poly1, $acc3, $t1
   1911 
   1912 	adcx	$acc2, $acc1
   1913 	adox	$acc2, $acc2
   1914 	adc	\$0, $acc2
   1915 
   1916 	########################################################################
   1917 	# Fourth reduction step
   1918 	add	$t0, $acc4
   1919 	adc	$t1, $acc5
   1920 
   1921 	mulx	$poly3, $t0, $t1
   1922 	 mov	$acc4, $t2
   1923 	mov	.Lpoly+8*1(%rip), $poly1
   1924 	adc	$t0, $acc0
   1925 	 mov	$acc5, $t3
   1926 	adc	$t1, $acc1
   1927 	adc	\$0, $acc2
   1928 
   1929 	########################################################################
   1930 	# Branch-less conditional subtraction of P
   1931 	xor	%eax, %eax
   1932 	 mov	$acc0, $t0
   1933 	sbb	\$-1, $acc4		# .Lpoly[0]
   1934 	sbb	$poly1, $acc5		# .Lpoly[1]
   1935 	sbb	\$0, $acc0		# .Lpoly[2]
   1936 	 mov	$acc1, $t1
   1937 	sbb	$poly3, $acc1		# .Lpoly[3]
   1938 	sbb	\$0, $acc2
   1939 
   1940 	cmovc	$t2, $acc4
   1941 	cmovc	$t3, $acc5
   1942 	mov	$acc4, 8*0($r_ptr)
   1943 	cmovc	$t0, $acc0
   1944 	mov	$acc5, 8*1($r_ptr)
   1945 	cmovc	$t1, $acc1
   1946 	mov	$acc0, 8*2($r_ptr)
   1947 	mov	$acc1, 8*3($r_ptr)
   1948 
   1949 	ret
   1950 .cfi_endproc
   1951 .size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
   1952 
   1953 .type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
   1954 .align	32
   1955 __ecp_nistz256_sqr_montx:
   1956 .cfi_startproc
   1957 	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
   1958 	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
   1959 	xor	%eax, %eax
   1960 	adc	$t0, $acc2
   1961 	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
   1962 	 mov	$acc6, %rdx
   1963 	adc	$t1, $acc3
   1964 	adc	\$0, $acc4
   1965 	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
   1966 
   1967 	#################################
   1968 	mulx	$acc7, $t0, $t1		# a[1]*a[2]
   1969 	adcx	$t0, $acc3
   1970 	adox	$t1, $acc4
   1971 
   1972 	mulx	$acc0, $t0, $t1		# a[1]*a[3]
   1973 	 mov	$acc7, %rdx
   1974 	adcx	$t0, $acc4
   1975 	adox	$t1, $acc5
   1976 	adc	\$0, $acc5
   1977 
   1978 	#################################
   1979 	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
   1980 	 mov	8*0+128($a_ptr), %rdx
   1981 	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
   1982 	 adcx	$acc1, $acc1		# acc1:6<<1
   1983 	adox	$t0, $acc5
   1984 	 adcx	$acc2, $acc2
   1985 	adox	$acc7, $acc6		# of=0
   1986 
   1987 	mulx	%rdx, $acc0, $t1
   1988 	mov	8*1+128($a_ptr), %rdx
   1989 	 adcx	$acc3, $acc3
   1990 	adox	$t1, $acc1
   1991 	 adcx	$acc4, $acc4
   1992 	mulx	%rdx, $t0, $t4
   1993 	mov	8*2+128($a_ptr), %rdx
   1994 	 adcx	$acc5, $acc5
   1995 	adox	$t0, $acc2
   1996 	 adcx	$acc6, $acc6
   1997 	.byte	0x67
   1998 	mulx	%rdx, $t0, $t1
   1999 	mov	8*3+128($a_ptr), %rdx
   2000 	adox	$t4, $acc3
   2001 	 adcx	$acc7, $acc7
   2002 	adox	$t0, $acc4
   2003 	 mov	\$32, $a_ptr
   2004 	adox	$t1, $acc5
   2005 	.byte	0x67,0x67
   2006 	mulx	%rdx, $t0, $t4
   2007 	 mov	.Lpoly+8*3(%rip), %rdx
   2008 	adox	$t0, $acc6
   2009 	 shlx	$a_ptr, $acc0, $t0
   2010 	adox	$t4, $acc7
   2011 	 shrx	$a_ptr, $acc0, $t4
   2012 	mov	%rdx,$t1
   2013 
   2014 	# reduction step 1
   2015 	add	$t0, $acc1
   2016 	adc	$t4, $acc2
   2017 
   2018 	mulx	$acc0, $t0, $acc0
   2019 	adc	$t0, $acc3
   2020 	 shlx	$a_ptr, $acc1, $t0
   2021 	adc	\$0, $acc0
   2022 	 shrx	$a_ptr, $acc1, $t4
   2023 
   2024 	# reduction step 2
   2025 	add	$t0, $acc2
   2026 	adc	$t4, $acc3
   2027 
   2028 	mulx	$acc1, $t0, $acc1
   2029 	adc	$t0, $acc0
   2030 	 shlx	$a_ptr, $acc2, $t0
   2031 	adc	\$0, $acc1
   2032 	 shrx	$a_ptr, $acc2, $t4
   2033 
   2034 	# reduction step 3
   2035 	add	$t0, $acc3
   2036 	adc	$t4, $acc0
   2037 
   2038 	mulx	$acc2, $t0, $acc2
   2039 	adc	$t0, $acc1
   2040 	 shlx	$a_ptr, $acc3, $t0
   2041 	adc	\$0, $acc2
   2042 	 shrx	$a_ptr, $acc3, $t4
   2043 
   2044 	# reduction step 4
   2045 	add	$t0, $acc0
   2046 	adc	$t4, $acc1
   2047 
   2048 	mulx	$acc3, $t0, $acc3
   2049 	adc	$t0, $acc2
   2050 	adc	\$0, $acc3
   2051 
   2052 	xor	$t3, $t3
   2053 	add	$acc0, $acc4		# accumulate upper half
   2054 	 mov	.Lpoly+8*1(%rip), $a_ptr
   2055 	adc	$acc1, $acc5
   2056 	 mov	$acc4, $acc0
   2057 	adc	$acc2, $acc6
   2058 	adc	$acc3, $acc7
   2059 	 mov	$acc5, $acc1
   2060 	adc	\$0, $t3
   2061 
   2062 	sub	\$-1, $acc4		# .Lpoly[0]
   2063 	 mov	$acc6, $acc2
   2064 	sbb	$a_ptr, $acc5		# .Lpoly[1]
   2065 	sbb	\$0, $acc6		# .Lpoly[2]
   2066 	 mov	$acc7, $acc3
   2067 	sbb	$t1, $acc7		# .Lpoly[3]
   2068 	sbb	\$0, $t3
   2069 
   2070 	cmovc	$acc0, $acc4
   2071 	cmovc	$acc1, $acc5
   2072 	mov	$acc4, 8*0($r_ptr)
   2073 	cmovc	$acc2, $acc6
   2074 	mov	$acc5, 8*1($r_ptr)
   2075 	cmovc	$acc3, $acc7
   2076 	mov	$acc6, 8*2($r_ptr)
   2077 	mov	$acc7, 8*3($r_ptr)
   2078 
   2079 	ret
   2080 .cfi_endproc
   2081 .size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
   2082 ___
   2083 }
   2084 }
   2085 {
   2086 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
   2087 my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
   2088 my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
    2089 my ($M1,$T1a,$T1b,$TMP1,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
   2090 
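# Both select routines below implement a constant-time table look-up:
# every table entry is loaded, masked with the result of a pcmpeqd
# against the requested index, and OR-ed into the accumulator, so the
# sequence of memory accesses is independent of the (secret) index, and
# index 0 selects nothing (an all-zero output, i.e. the implicit point
# at infinity).  A minimal Perl sketch of the idea, for reference only
# (hypothetical helper, never called; entries are arrays of limbs):
#
#	sub const_time_select_ref {
#	    my ($table, $index) = @_;		# $index counts from 1
#	    my @val = (0) x scalar(@{$table->[0]});
#	    for my $i (1 .. scalar(@$table)) {
#	        # the assembly derives this mask without branching
#	        my $mask = ($i == $index) ? ~0 : 0;
#	        $val[$_] |= $table->[$i-1][$_] & $mask for 0 .. $#val;
#	    }
#	    return @val;
#	}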
   2091 $code.=<<___;
   2092 ################################################################################
   2093 # void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
   2094 .globl	ecp_nistz256_select_w5
   2095 .type	ecp_nistz256_select_w5,\@abi-omnipotent
   2096 .align	32
   2097 ecp_nistz256_select_w5:
   2098 .cfi_startproc
   2099 ___
   2100 $code.=<<___	if ($avx>1);
   2101 	leaq	OPENSSL_ia32cap_P(%rip), %rax
   2102 	mov	8(%rax), %rax
   2103 	test	\$`1<<5`, %eax
   2104 	jnz	.Lavx2_select_w5
   2105 ___
   2106 $code.=<<___	if ($win64);
   2107 	lea	-0x88(%rsp), %rax
   2108 .LSEH_begin_ecp_nistz256_select_w5:
   2109 	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
   2110 	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
   2111 	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
   2112 	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
   2113 	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
   2114 	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
   2115 	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
   2116 	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
   2117 	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
   2118 	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
   2119 	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
   2120 ___
   2121 $code.=<<___;
   2122 	movdqa	.LOne(%rip), $ONE
   2123 	movd	$index, $INDEX
   2124 
   2125 	pxor	$Ra, $Ra
   2126 	pxor	$Rb, $Rb
   2127 	pxor	$Rc, $Rc
   2128 	pxor	$Rd, $Rd
   2129 	pxor	$Re, $Re
   2130 	pxor	$Rf, $Rf
   2131 
   2132 	movdqa	$ONE, $M0
   2133 	pshufd	\$0, $INDEX, $INDEX
   2134 
   2135 	mov	\$16, %rax
   2136 .Lselect_loop_sse_w5:
   2137 
   2138 	movdqa	$M0, $TMP0
   2139 	paddd	$ONE, $M0
   2140 	pcmpeqd $INDEX, $TMP0
   2141 
   2142 	movdqa	16*0($in_t), $T0a
   2143 	movdqa	16*1($in_t), $T0b
   2144 	movdqa	16*2($in_t), $T0c
   2145 	movdqa	16*3($in_t), $T0d
   2146 	movdqa	16*4($in_t), $T0e
   2147 	movdqa	16*5($in_t), $T0f
   2148 	lea 16*6($in_t), $in_t
   2149 
   2150 	pand	$TMP0, $T0a
   2151 	pand	$TMP0, $T0b
   2152 	por	$T0a, $Ra
   2153 	pand	$TMP0, $T0c
   2154 	por	$T0b, $Rb
   2155 	pand	$TMP0, $T0d
   2156 	por	$T0c, $Rc
   2157 	pand	$TMP0, $T0e
   2158 	por	$T0d, $Rd
   2159 	pand	$TMP0, $T0f
   2160 	por	$T0e, $Re
   2161 	por	$T0f, $Rf
   2162 
   2163 	dec	%rax
   2164 	jnz	.Lselect_loop_sse_w5
   2165 
   2166 	movdqu	$Ra, 16*0($val)
   2167 	movdqu	$Rb, 16*1($val)
   2168 	movdqu	$Rc, 16*2($val)
   2169 	movdqu	$Rd, 16*3($val)
   2170 	movdqu	$Re, 16*4($val)
   2171 	movdqu	$Rf, 16*5($val)
   2172 ___
   2173 $code.=<<___	if ($win64);
   2174 	movaps	(%rsp), %xmm6
   2175 	movaps	0x10(%rsp), %xmm7
   2176 	movaps	0x20(%rsp), %xmm8
   2177 	movaps	0x30(%rsp), %xmm9
   2178 	movaps	0x40(%rsp), %xmm10
   2179 	movaps	0x50(%rsp), %xmm11
   2180 	movaps	0x60(%rsp), %xmm12
   2181 	movaps	0x70(%rsp), %xmm13
   2182 	movaps	0x80(%rsp), %xmm14
   2183 	movaps	0x90(%rsp), %xmm15
   2184 	lea	0xa8(%rsp), %rsp
   2185 ___
   2186 $code.=<<___;
   2187 	ret
   2188 .cfi_endproc
   2189 .LSEH_end_ecp_nistz256_select_w5:
   2190 .size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
   2191 
   2192 ################################################################################
   2193 # void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
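# Same constant-time pattern as select_w5 above, but over 64 affine
# (x,y) entries of 64 bytes each: the loop scans candidates 1..64, so
# a zero index matches nothing and, as in the AVX2 path below, stands
# for the implicit point at infinity.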
   2194 .globl	ecp_nistz256_select_w7
   2195 .type	ecp_nistz256_select_w7,\@abi-omnipotent
   2196 .align	32
   2197 ecp_nistz256_select_w7:
   2198 .cfi_startproc
   2199 ___
   2200 $code.=<<___	if ($avx>1);
   2201 	leaq	OPENSSL_ia32cap_P(%rip), %rax
   2202 	mov	8(%rax), %rax
   2203 	test	\$`1<<5`, %eax
   2204 	jnz	.Lavx2_select_w7
   2205 ___
   2206 $code.=<<___	if ($win64);
   2207 	lea	-0x88(%rsp), %rax
   2208 .LSEH_begin_ecp_nistz256_select_w7:
   2209 	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
   2210 	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
   2211 	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
   2212 	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
   2213 	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
   2214 	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
   2215 	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
   2216 	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
   2217 	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
   2218 	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
   2219 	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
   2220 ___
   2221 $code.=<<___;
   2222 	movdqa	.LOne(%rip), $M0
   2223 	movd	$index, $INDEX
   2224 
   2225 	pxor	$Ra, $Ra
   2226 	pxor	$Rb, $Rb
   2227 	pxor	$Rc, $Rc
   2228 	pxor	$Rd, $Rd
   2229 
   2230 	movdqa	$M0, $ONE
   2231 	pshufd	\$0, $INDEX, $INDEX
   2232 	mov	\$64, %rax
   2233 
   2234 .Lselect_loop_sse_w7:
   2235 	movdqa	$M0, $TMP0
   2236 	paddd	$ONE, $M0
   2237 	movdqa	16*0($in_t), $T0a
   2238 	movdqa	16*1($in_t), $T0b
   2239 	pcmpeqd	$INDEX, $TMP0
   2240 	movdqa	16*2($in_t), $T0c
   2241 	movdqa	16*3($in_t), $T0d
   2242 	lea	16*4($in_t), $in_t
   2243 
   2244 	pand	$TMP0, $T0a
   2245 	pand	$TMP0, $T0b
   2246 	por	$T0a, $Ra
   2247 	pand	$TMP0, $T0c
   2248 	por	$T0b, $Rb
   2249 	pand	$TMP0, $T0d
   2250 	por	$T0c, $Rc
   2251 	prefetcht0	255($in_t)
   2252 	por	$T0d, $Rd
   2253 
   2254 	dec	%rax
   2255 	jnz	.Lselect_loop_sse_w7
   2256 
   2257 	movdqu	$Ra, 16*0($val)
   2258 	movdqu	$Rb, 16*1($val)
   2259 	movdqu	$Rc, 16*2($val)
   2260 	movdqu	$Rd, 16*3($val)
   2261 ___
   2262 $code.=<<___	if ($win64);
   2263 	movaps	(%rsp), %xmm6
   2264 	movaps	0x10(%rsp), %xmm7
   2265 	movaps	0x20(%rsp), %xmm8
   2266 	movaps	0x30(%rsp), %xmm9
   2267 	movaps	0x40(%rsp), %xmm10
   2268 	movaps	0x50(%rsp), %xmm11
   2269 	movaps	0x60(%rsp), %xmm12
   2270 	movaps	0x70(%rsp), %xmm13
   2271 	movaps	0x80(%rsp), %xmm14
   2272 	movaps	0x90(%rsp), %xmm15
   2273 	lea	0xa8(%rsp), %rsp
   2274 ___
   2275 $code.=<<___;
   2276 	ret
   2277 .cfi_endproc
   2278 .LSEH_end_ecp_nistz256_select_w7:
   2279 .size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
   2280 ___
   2281 }
   2282 if ($avx>1) {
   2283 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
   2284 my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
   2285 my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
   2286 my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
   2287 
   2288 $code.=<<___;
   2289 ################################################################################
   2290 # void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
   2291 .type	ecp_nistz256_avx2_select_w5,\@abi-omnipotent
   2292 .align	32
   2293 ecp_nistz256_avx2_select_w5:
   2294 .cfi_startproc
   2295 .Lavx2_select_w5:
   2296 	vzeroupper
   2297 ___
   2298 $code.=<<___	if ($win64);
   2299 	lea	-0x88(%rsp), %rax
   2300 	mov	%rsp,%r11
   2301 .LSEH_begin_ecp_nistz256_avx2_select_w5:
   2302 	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
   2303 	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
   2304 	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
    2305 	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
   2306 	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
   2307 	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
   2308 	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
   2309 	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
   2310 	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
   2311 	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
   2312 	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
   2313 ___
   2314 $code.=<<___;
   2315 	vmovdqa	.LTwo(%rip), $TWO
   2316 
   2317 	vpxor	$Ra, $Ra, $Ra
   2318 	vpxor	$Rb, $Rb, $Rb
   2319 	vpxor	$Rc, $Rc, $Rc
   2320 
   2321 	vmovdqa .LOne(%rip), $M0
   2322 	vmovdqa .LTwo(%rip), $M1
   2323 
   2324 	vmovd	$index, %xmm1
   2325 	vpermd	$INDEX, $Ra, $INDEX
   2326 
   2327 	mov	\$8, %rax
   2328 .Lselect_loop_avx2_w5:
   2329 
   2330 	vmovdqa	32*0($in_t), $T0a
   2331 	vmovdqa	32*1($in_t), $T0b
   2332 	vmovdqa	32*2($in_t), $T0c
   2333 
   2334 	vmovdqa	32*3($in_t), $T1a
   2335 	vmovdqa	32*4($in_t), $T1b
   2336 	vmovdqa	32*5($in_t), $T1c
   2337 
   2338 	vpcmpeqd	$INDEX, $M0, $TMP0
   2339 	vpcmpeqd	$INDEX, $M1, $TMP1
   2340 
   2341 	vpaddd	$TWO, $M0, $M0
   2342 	vpaddd	$TWO, $M1, $M1
   2343 	lea	32*6($in_t), $in_t
   2344 
   2345 	vpand	$TMP0, $T0a, $T0a
   2346 	vpand	$TMP0, $T0b, $T0b
   2347 	vpand	$TMP0, $T0c, $T0c
   2348 	vpand	$TMP1, $T1a, $T1a
   2349 	vpand	$TMP1, $T1b, $T1b
   2350 	vpand	$TMP1, $T1c, $T1c
   2351 
   2352 	vpxor	$T0a, $Ra, $Ra
   2353 	vpxor	$T0b, $Rb, $Rb
   2354 	vpxor	$T0c, $Rc, $Rc
   2355 	vpxor	$T1a, $Ra, $Ra
   2356 	vpxor	$T1b, $Rb, $Rb
   2357 	vpxor	$T1c, $Rc, $Rc
   2358 
   2359 	dec %rax
   2360 	jnz .Lselect_loop_avx2_w5
   2361 
   2362 	vmovdqu $Ra, 32*0($val)
   2363 	vmovdqu $Rb, 32*1($val)
   2364 	vmovdqu $Rc, 32*2($val)
   2365 	vzeroupper
   2366 ___
   2367 $code.=<<___	if ($win64);
   2368 	movaps	(%rsp), %xmm6
   2369 	movaps	0x10(%rsp), %xmm7
   2370 	movaps	0x20(%rsp), %xmm8
   2371 	movaps	0x30(%rsp), %xmm9
   2372 	movaps	0x40(%rsp), %xmm10
   2373 	movaps	0x50(%rsp), %xmm11
   2374 	movaps	0x60(%rsp), %xmm12
   2375 	movaps	0x70(%rsp), %xmm13
   2376 	movaps	0x80(%rsp), %xmm14
   2377 	movaps	0x90(%rsp), %xmm15
   2378 	lea	(%r11), %rsp
   2379 ___
   2380 $code.=<<___;
   2381 	ret
   2382 .cfi_endproc
   2383 .LSEH_end_ecp_nistz256_avx2_select_w5:
   2384 .size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
   2385 ___
   2386 }
   2387 if ($avx>1) {
   2388 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
   2389 my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
   2390 my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
   2391 my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
   2392 my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
   2393 
   2394 $code.=<<___;
   2395 
   2396 ################################################################################
   2397 # void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
   2398 .globl	ecp_nistz256_avx2_select_w7
   2399 .type	ecp_nistz256_avx2_select_w7,\@abi-omnipotent
   2400 .align	32
   2401 ecp_nistz256_avx2_select_w7:
   2402 .cfi_startproc
   2403 .Lavx2_select_w7:
   2404 	vzeroupper
   2405 ___
   2406 $code.=<<___	if ($win64);
   2407 	mov	%rsp,%r11
   2408 	lea	-0x88(%rsp), %rax
   2409 .LSEH_begin_ecp_nistz256_avx2_select_w7:
   2410 	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
   2411 	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
   2412 	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
    2413 	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
   2414 	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
   2415 	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
   2416 	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
   2417 	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
   2418 	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
   2419 	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
   2420 	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
   2421 ___
   2422 $code.=<<___;
   2423 	vmovdqa	.LThree(%rip), $THREE
   2424 
   2425 	vpxor	$Ra, $Ra, $Ra
   2426 	vpxor	$Rb, $Rb, $Rb
   2427 
   2428 	vmovdqa .LOne(%rip), $M0
   2429 	vmovdqa .LTwo(%rip), $M1
   2430 	vmovdqa .LThree(%rip), $M2
   2431 
   2432 	vmovd	$index, %xmm1
   2433 	vpermd	$INDEX, $Ra, $INDEX
   2434 	# Skip index = 0, because it is implicitly the point at infinity
   2435 
   2436 	mov	\$21, %rax
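	# The index is broadcast to all lanes and compared against three
	# running counters per iteration: 21 iterations cover candidates
	# 1..63 and the single tail step below handles candidate 64, so all
	# 64 entries are touched regardless of which one is selected.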
   2437 .Lselect_loop_avx2_w7:
   2438 
   2439 	vmovdqa	32*0($in_t), $T0a
   2440 	vmovdqa	32*1($in_t), $T0b
   2441 
   2442 	vmovdqa	32*2($in_t), $T1a
   2443 	vmovdqa	32*3($in_t), $T1b
   2444 
   2445 	vmovdqa	32*4($in_t), $T2a
   2446 	vmovdqa	32*5($in_t), $T2b
   2447 
   2448 	vpcmpeqd	$INDEX, $M0, $TMP0
   2449 	vpcmpeqd	$INDEX, $M1, $TMP1
   2450 	vpcmpeqd	$INDEX, $M2, $TMP2
   2451 
   2452 	vpaddd	$THREE, $M0, $M0
   2453 	vpaddd	$THREE, $M1, $M1
   2454 	vpaddd	$THREE, $M2, $M2
   2455 	lea	32*6($in_t), $in_t
   2456 
   2457 	vpand	$TMP0, $T0a, $T0a
   2458 	vpand	$TMP0, $T0b, $T0b
   2459 	vpand	$TMP1, $T1a, $T1a
   2460 	vpand	$TMP1, $T1b, $T1b
   2461 	vpand	$TMP2, $T2a, $T2a
   2462 	vpand	$TMP2, $T2b, $T2b
   2463 
   2464 	vpxor	$T0a, $Ra, $Ra
   2465 	vpxor	$T0b, $Rb, $Rb
   2466 	vpxor	$T1a, $Ra, $Ra
   2467 	vpxor	$T1b, $Rb, $Rb
   2468 	vpxor	$T2a, $Ra, $Ra
   2469 	vpxor	$T2b, $Rb, $Rb
   2470 
   2471 	dec %rax
   2472 	jnz .Lselect_loop_avx2_w7
   2473 
   2474 
   2475 	vmovdqa	32*0($in_t), $T0a
   2476 	vmovdqa	32*1($in_t), $T0b
   2477 
   2478 	vpcmpeqd	$INDEX, $M0, $TMP0
   2479 
   2480 	vpand	$TMP0, $T0a, $T0a
   2481 	vpand	$TMP0, $T0b, $T0b
   2482 
   2483 	vpxor	$T0a, $Ra, $Ra
   2484 	vpxor	$T0b, $Rb, $Rb
   2485 
   2486 	vmovdqu $Ra, 32*0($val)
   2487 	vmovdqu $Rb, 32*1($val)
   2488 	vzeroupper
   2489 ___
   2490 $code.=<<___	if ($win64);
   2491 	movaps	(%rsp), %xmm6
   2492 	movaps	0x10(%rsp), %xmm7
   2493 	movaps	0x20(%rsp), %xmm8
   2494 	movaps	0x30(%rsp), %xmm9
   2495 	movaps	0x40(%rsp), %xmm10
   2496 	movaps	0x50(%rsp), %xmm11
   2497 	movaps	0x60(%rsp), %xmm12
   2498 	movaps	0x70(%rsp), %xmm13
   2499 	movaps	0x80(%rsp), %xmm14
   2500 	movaps	0x90(%rsp), %xmm15
   2501 	lea	(%r11), %rsp
   2502 ___
   2503 $code.=<<___;
   2504 	ret
   2505 .cfi_endproc
   2506 .LSEH_end_ecp_nistz256_avx2_select_w7:
   2507 .size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
   2508 ___
   2509 } else {
   2510 $code.=<<___;
   2511 .globl	ecp_nistz256_avx2_select_w7
   2512 .type	ecp_nistz256_avx2_select_w7,\@function,3
   2513 .align	32
   2514 ecp_nistz256_avx2_select_w7:
   2515 	.byte	0x0f,0x0b	# ud2
   2516 	ret
   2517 .size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
   2518 ___
   2519 }
   2520 {{{
   2521 ########################################################################
   2522 # This block implements higher level point_double, point_add and
   2523 # point_add_affine. The key to performance in this case is to allow
   2524 # out-of-order execution logic to overlap computations from next step
   2525 # with tail processing from current step. By using tailored calling
   2526 # sequence we minimize inter-step overhead to give processor better
   2527 # shot at overlapping operations...
   2528 #
   2529 # You will notice that input data is copied to stack. Trouble is that
   2530 # there are no registers to spare for holding original pointers and
    2531 # reloading those pointers would create undesired dependencies on
    2532 # effective-address calculation paths. In other words, the copying is
    2533 # done deliberately to favour out-of-order execution logic.
    2534 #						<appro@openssl.org>
   2535 
   2536 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
   2537 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
   2538 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
   2539 my ($poly1,$poly3)=($acc6,$acc7);
   2540 
   2541 sub load_for_mul () {
   2542 my ($a,$b,$src0) = @_;
   2543 my $bias = $src0 eq "%rax" ? 0 : -128;
   2544 
   2545 "	mov	$b, $src0
   2546 	lea	$b, $b_ptr
   2547 	mov	8*0+$a, $acc1
   2548 	mov	8*1+$a, $acc2
   2549 	lea	$bias+$a, $a_ptr
   2550 	mov	8*2+$a, $acc3
   2551 	mov	8*3+$a, $acc4"
   2552 }
   2553 
   2554 sub load_for_sqr () {
   2555 my ($a,$src0) = @_;
   2556 my $bias = $src0 eq "%rax" ? 0 : -128;
   2557 
   2558 "	mov	8*0+$a, $src0
   2559 	mov	8*1+$a, $acc6
   2560 	lea	$bias+$a, $a_ptr
   2561 	mov	8*2+$a, $acc7
   2562 	mov	8*3+$a, $acc0"
   2563 }
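# The two helpers above are interpolated into the assembly templates via
# backticks, e.g. `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`:
# they preload one operand's four limbs into $acc1..$acc4 (or, for a
# squaring, into $src0/$acc6/$acc7/$acc0) and the other operand's low
# limb into $src0, and point $a_ptr/$b_ptr at the data, so the called
# __ecp_nistz256_*_mont$x routine can start multiplying without touching
# memory again.  The -128 bias applied when $src0 is %rdx compensates
# for the +128 displacement the mulx-based routines use when reading
# through $a_ptr (see the 8*0+128($a_ptr) loads above).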
   2564 
   2565 									{
   2566 ########################################################################
   2567 # operate in 4-5-0-1 "name space" that matches multiplication output
   2568 #
   2569 my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
   2570 
   2571 $code.=<<___;
   2572 .type	__ecp_nistz256_add_toq,\@abi-omnipotent
   2573 .align	32
   2574 __ecp_nistz256_add_toq:
   2575 .cfi_startproc
   2576 	xor	$t4,$t4
   2577 	add	8*0($b_ptr), $a0
   2578 	adc	8*1($b_ptr), $a1
   2579 	 mov	$a0, $t0
   2580 	adc	8*2($b_ptr), $a2
   2581 	adc	8*3($b_ptr), $a3
   2582 	 mov	$a1, $t1
   2583 	adc	\$0, $t4
   2584 
   2585 	sub	\$-1, $a0
   2586 	 mov	$a2, $t2
   2587 	sbb	$poly1, $a1
   2588 	sbb	\$0, $a2
   2589 	 mov	$a3, $t3
   2590 	sbb	$poly3, $a3
   2591 	sbb	\$0, $t4
   2592 
   2593 	cmovc	$t0, $a0
   2594 	cmovc	$t1, $a1
   2595 	mov	$a0, 8*0($r_ptr)
   2596 	cmovc	$t2, $a2
   2597 	mov	$a1, 8*1($r_ptr)
   2598 	cmovc	$t3, $a3
   2599 	mov	$a2, 8*2($r_ptr)
   2600 	mov	$a3, 8*3($r_ptr)
   2601 
   2602 	ret
   2603 .cfi_endproc
   2604 .size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
   2605 
   2606 .type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
   2607 .align	32
   2608 __ecp_nistz256_sub_fromq:
   2609 .cfi_startproc
   2610 	sub	8*0($b_ptr), $a0
   2611 	sbb	8*1($b_ptr), $a1
   2612 	 mov	$a0, $t0
   2613 	sbb	8*2($b_ptr), $a2
   2614 	sbb	8*3($b_ptr), $a3
   2615 	 mov	$a1, $t1
   2616 	sbb	$t4, $t4
   2617 
   2618 	add	\$-1, $a0
   2619 	 mov	$a2, $t2
   2620 	adc	$poly1, $a1
   2621 	adc	\$0, $a2
   2622 	 mov	$a3, $t3
   2623 	adc	$poly3, $a3
   2624 	test	$t4, $t4
   2625 
   2626 	cmovz	$t0, $a0
   2627 	cmovz	$t1, $a1
   2628 	mov	$a0, 8*0($r_ptr)
   2629 	cmovz	$t2, $a2
   2630 	mov	$a1, 8*1($r_ptr)
   2631 	cmovz	$t3, $a3
   2632 	mov	$a2, 8*2($r_ptr)
   2633 	mov	$a3, 8*3($r_ptr)
   2634 
   2635 	ret
   2636 .cfi_endproc
   2637 .size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
   2638 
   2639 .type	__ecp_nistz256_subq,\@abi-omnipotent
   2640 .align	32
   2641 __ecp_nistz256_subq:
   2642 .cfi_startproc
   2643 	sub	$a0, $t0
   2644 	sbb	$a1, $t1
   2645 	 mov	$t0, $a0
   2646 	sbb	$a2, $t2
   2647 	sbb	$a3, $t3
   2648 	 mov	$t1, $a1
   2649 	sbb	$t4, $t4
   2650 
   2651 	add	\$-1, $t0
   2652 	 mov	$t2, $a2
   2653 	adc	$poly1, $t1
   2654 	adc	\$0, $t2
   2655 	 mov	$t3, $a3
   2656 	adc	$poly3, $t3
   2657 	test	$t4, $t4
   2658 
   2659 	cmovnz	$t0, $a0
   2660 	cmovnz	$t1, $a1
   2661 	cmovnz	$t2, $a2
   2662 	cmovnz	$t3, $a3
   2663 
   2664 	ret
   2665 .cfi_endproc
   2666 .size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
   2667 
   2668 .type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
   2669 .align	32
   2670 __ecp_nistz256_mul_by_2q:
   2671 .cfi_startproc
   2672 	xor	$t4, $t4
   2673 	add	$a0, $a0		# a0:a3+a0:a3
   2674 	adc	$a1, $a1
   2675 	 mov	$a0, $t0
   2676 	adc	$a2, $a2
   2677 	adc	$a3, $a3
   2678 	 mov	$a1, $t1
   2679 	adc	\$0, $t4
   2680 
   2681 	sub	\$-1, $a0
   2682 	 mov	$a2, $t2
   2683 	sbb	$poly1, $a1
   2684 	sbb	\$0, $a2
   2685 	 mov	$a3, $t3
   2686 	sbb	$poly3, $a3
   2687 	sbb	\$0, $t4
   2688 
   2689 	cmovc	$t0, $a0
   2690 	cmovc	$t1, $a1
   2691 	mov	$a0, 8*0($r_ptr)
   2692 	cmovc	$t2, $a2
   2693 	mov	$a1, 8*1($r_ptr)
   2694 	cmovc	$t3, $a3
   2695 	mov	$a2, 8*2($r_ptr)
   2696 	mov	$a3, 8*3($r_ptr)
   2697 
   2698 	ret
   2699 .cfi_endproc
   2700 .size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
   2701 ___
   2702 									}
   2703 sub gen_double () {
   2704     my $x = shift;
   2705     my ($src0,$sfx,$bias);
   2706     my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
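    # Stack frame: five 256-bit temporaries at 32-byte offsets (S, M,
    # Zsqr, a stack copy of the input x coordinate, and tmp0), matching
    # the 32*5+8 stack adjustment in the prologue below.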
   2707 
   2708     if ($x ne "x") {
   2709 	$src0 = "%rax";
   2710 	$sfx  = "";
   2711 	$bias = 0;
   2712 
   2713 $code.=<<___;
   2714 .globl	ecp_nistz256_point_double
   2715 .type	ecp_nistz256_point_double,\@function,2
   2716 .align	32
   2717 ecp_nistz256_point_double:
   2718 .cfi_startproc
   2719 ___
   2720 $code.=<<___	if ($addx);
   2721 	leaq	OPENSSL_ia32cap_P(%rip), %rcx
   2722 	mov	8(%rcx), %rcx
   2723 	and	\$0x80100, %ecx
   2724 	cmp	\$0x80100, %ecx
   2725 	je	.Lpoint_doublex
   2726 ___
   2727     } else {
   2728 	$src0 = "%rdx";
   2729 	$sfx  = "x";
   2730 	$bias = 128;
   2731 
   2732 $code.=<<___;
   2733 .type	ecp_nistz256_point_doublex,\@function,2
   2734 .align	32
   2735 ecp_nistz256_point_doublex:
   2736 .cfi_startproc
   2737 .Lpoint_doublex:
   2738 ___
   2739     }
   2740 $code.=<<___;
   2741 	push	%rbp
   2742 .cfi_push	%rbp
   2743 	push	%rbx
   2744 .cfi_push	%rbx
   2745 	push	%r12
   2746 .cfi_push	%r12
   2747 	push	%r13
   2748 .cfi_push	%r13
   2749 	push	%r14
   2750 .cfi_push	%r14
   2751 	push	%r15
   2752 .cfi_push	%r15
   2753 	sub	\$32*5+8, %rsp
   2754 .cfi_adjust_cfa_offset	32*5+8
   2755 .Lpoint_double${x}_body:
   2756 
   2757 .Lpoint_double_shortcut$x:
   2758 	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
   2759 	mov	$a_ptr, $b_ptr			# backup copy
   2760 	movdqu	0x10($a_ptr), %xmm1
    2761 	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "4-5-0-1" order
   2762 	 mov	0x20+8*1($a_ptr), $acc5
   2763 	 mov	0x20+8*2($a_ptr), $acc0
   2764 	 mov	0x20+8*3($a_ptr), $acc1
   2765 	 mov	.Lpoly+8*1(%rip), $poly1
   2766 	 mov	.Lpoly+8*3(%rip), $poly3
   2767 	movdqa	%xmm0, $in_x(%rsp)
   2768 	movdqa	%xmm1, $in_x+0x10(%rsp)
   2769 	lea	0x20($r_ptr), $acc2
   2770 	lea	0x40($r_ptr), $acc3
   2771 	movq	$r_ptr, %xmm0
   2772 	movq	$acc2, %xmm1
   2773 	movq	$acc3, %xmm2
   2774 
   2775 	lea	$S(%rsp), $r_ptr
   2776 	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
   2777 
   2778 	mov	0x40+8*0($a_ptr), $src0
   2779 	mov	0x40+8*1($a_ptr), $acc6
   2780 	mov	0x40+8*2($a_ptr), $acc7
   2781 	mov	0x40+8*3($a_ptr), $acc0
   2782 	lea	0x40-$bias($a_ptr), $a_ptr
   2783 	lea	$Zsqr(%rsp), $r_ptr
   2784 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
   2785 
   2786 	`&load_for_sqr("$S(%rsp)", "$src0")`
   2787 	lea	$S(%rsp), $r_ptr
   2788 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
   2789 
   2790 	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
   2791 	mov	0x40+8*0($b_ptr), $acc1
   2792 	mov	0x40+8*1($b_ptr), $acc2
   2793 	mov	0x40+8*2($b_ptr), $acc3
   2794 	mov	0x40+8*3($b_ptr), $acc4
   2795 	lea	0x40-$bias($b_ptr), $a_ptr
   2796 	lea	0x20($b_ptr), $b_ptr
   2797 	movq	%xmm2, $r_ptr
   2798 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
   2799 	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
   2800 
    2801 	mov	$in_x+8*0(%rsp), $acc4		# "4-5-0-1" order
   2802 	mov	$in_x+8*1(%rsp), $acc5
   2803 	lea	$Zsqr(%rsp), $b_ptr
   2804 	mov	$in_x+8*2(%rsp), $acc0
   2805 	mov	$in_x+8*3(%rsp), $acc1
   2806 	lea	$M(%rsp), $r_ptr
   2807 	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
   2808 
    2809 	mov	$in_x+8*0(%rsp), $acc4		# "4-5-0-1" order
   2810 	mov	$in_x+8*1(%rsp), $acc5
   2811 	lea	$Zsqr(%rsp), $b_ptr
   2812 	mov	$in_x+8*2(%rsp), $acc0
   2813 	mov	$in_x+8*3(%rsp), $acc1
   2814 	lea	$Zsqr(%rsp), $r_ptr
   2815 	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
   2816 
   2817 	`&load_for_sqr("$S(%rsp)", "$src0")`
   2818 	movq	%xmm1, $r_ptr
   2819 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
   2820 ___
   2821 {
   2822 ######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
   2823 # operate in 4-5-6-7 "name space" that matches squaring output
   2824 #
   2825 my ($poly1,$poly3)=($a_ptr,$t1);
   2826 my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
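# Halving modulo the prime: if the value is even it is simply shifted
# right; if it is odd, the (odd) prime is added first, so the sum is
# even and (a + P)/2 == a/2 mod P.  The code below computes a+P
# speculatively, reverts to a with cmovz when bit 0 of a is clear, and
# then performs the 257-bit right shift across the four limbs plus the
# carry word in $t4.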
   2827 
   2828 $code.=<<___;
   2829 	xor	$t4, $t4
   2830 	mov	$a0, $t0
   2831 	add	\$-1, $a0
   2832 	mov	$a1, $t1
   2833 	adc	$poly1, $a1
   2834 	mov	$a2, $t2
   2835 	adc	\$0, $a2
   2836 	mov	$a3, $t3
   2837 	adc	$poly3, $a3
   2838 	adc	\$0, $t4
   2839 	xor	$a_ptr, $a_ptr		# borrow $a_ptr
   2840 	test	\$1, $t0
   2841 
   2842 	cmovz	$t0, $a0
   2843 	cmovz	$t1, $a1
   2844 	cmovz	$t2, $a2
   2845 	cmovz	$t3, $a3
   2846 	cmovz	$a_ptr, $t4
   2847 
   2848 	mov	$a1, $t0		# a0:a3>>1
   2849 	shr	\$1, $a0
   2850 	shl	\$63, $t0
   2851 	mov	$a2, $t1
   2852 	shr	\$1, $a1
   2853 	or	$t0, $a0
   2854 	shl	\$63, $t1
   2855 	mov	$a3, $t2
   2856 	shr	\$1, $a2
   2857 	or	$t1, $a1
   2858 	shl	\$63, $t2
   2859 	mov	$a0, 8*0($r_ptr)
   2860 	shr	\$1, $a3
   2861 	mov	$a1, 8*1($r_ptr)
   2862 	shl	\$63, $t4
   2863 	or	$t2, $a2
   2864 	or	$t4, $a3
   2865 	mov	$a2, 8*2($r_ptr)
   2866 	mov	$a3, 8*3($r_ptr)
   2867 ___
   2868 }
   2869 $code.=<<___;
   2870 	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
   2871 	lea	$M(%rsp), $r_ptr
   2872 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
   2873 
   2874 	lea	$tmp0(%rsp), $r_ptr
   2875 	call	__ecp_nistz256_mul_by_2$x
   2876 
   2877 	lea	$M(%rsp), $b_ptr
   2878 	lea	$M(%rsp), $r_ptr
   2879 	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
   2880 
   2881 	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
   2882 	lea	$S(%rsp), $r_ptr
   2883 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
   2884 
   2885 	lea	$tmp0(%rsp), $r_ptr
   2886 	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
   2887 
   2888 	`&load_for_sqr("$M(%rsp)", "$src0")`
   2889 	movq	%xmm0, $r_ptr
   2890 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
   2891 
   2892 	lea	$tmp0(%rsp), $b_ptr
   2893 	mov	$acc6, $acc0			# harmonize sqr output and sub input
   2894 	mov	$acc7, $acc1
   2895 	mov	$a_ptr, $poly1
   2896 	mov	$t1, $poly3
   2897 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
   2898 
   2899 	mov	$S+8*0(%rsp), $t0
   2900 	mov	$S+8*1(%rsp), $t1
   2901 	mov	$S+8*2(%rsp), $t2
   2902 	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
   2903 	lea	$S(%rsp), $r_ptr
   2904 	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
   2905 
   2906 	mov	$M(%rsp), $src0
   2907 	lea	$M(%rsp), $b_ptr
   2908 	mov	$acc4, $acc6			# harmonize sub output and mul input
   2909 	xor	%ecx, %ecx
   2910 	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
   2911 	mov	$acc5, $acc2
   2912 	mov	$acc5, $S+8*1(%rsp)
   2913 	cmovz	$acc0, $acc3
   2914 	mov	$acc0, $S+8*2(%rsp)
   2915 	lea	$S-$bias(%rsp), $a_ptr
   2916 	cmovz	$acc1, $acc4
   2917 	mov	$acc1, $S+8*3(%rsp)
   2918 	mov	$acc6, $acc1
   2919 	lea	$S(%rsp), $r_ptr
   2920 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
   2921 
   2922 	movq	%xmm1, $b_ptr
   2923 	movq	%xmm1, $r_ptr
   2924 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
   2925 
   2926 	lea	32*5+56(%rsp), %rsi
   2927 .cfi_def_cfa	%rsi,8
   2928 	mov	-48(%rsi),%r15
   2929 .cfi_restore	%r15
   2930 	mov	-40(%rsi),%r14
   2931 .cfi_restore	%r14
   2932 	mov	-32(%rsi),%r13
   2933 .cfi_restore	%r13
   2934 	mov	-24(%rsi),%r12
   2935 .cfi_restore	%r12
   2936 	mov	-16(%rsi),%rbx
   2937 .cfi_restore	%rbx
   2938 	mov	-8(%rsi),%rbp
   2939 .cfi_restore	%rbp
   2940 	lea	(%rsi),%rsp
   2941 .cfi_def_cfa_register	%rsp
   2942 .Lpoint_double${x}_epilogue:
   2943 	ret
   2944 .cfi_endproc
   2945 .size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
   2946 ___
   2947 }
   2948 &gen_double("q");
   2949 
   2950 sub gen_add () {
   2951     my $x = shift;
   2952     my ($src0,$sfx,$bias);
   2953     my ($H,$Hsqr,$R,$Rsqr,$Hcub,
   2954 	$U1,$U2,$S1,$S2,
   2955 	$res_x,$res_y,$res_z,
   2956 	$in1_x,$in1_y,$in1_z,
   2957 	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
   2958     my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
   2959 
   2960     if ($x ne "x") {
   2961 	$src0 = "%rax";
   2962 	$sfx  = "";
   2963 	$bias = 0;
   2964 
   2965 $code.=<<___;
   2966 .globl	ecp_nistz256_point_add
   2967 .type	ecp_nistz256_point_add,\@function,3
   2968 .align	32
   2969 ecp_nistz256_point_add:
   2970 .cfi_startproc
   2971 ___
   2972 $code.=<<___	if ($addx);
   2973 	leaq	OPENSSL_ia32cap_P(%rip), %rcx
   2974 	mov	8(%rcx), %rcx
   2975 	and	\$0x80100, %ecx
   2976 	cmp	\$0x80100, %ecx
   2977 	je	.Lpoint_addx
   2978 ___
   2979     } else {
   2980 	$src0 = "%rdx";
   2981 	$sfx  = "x";
   2982 	$bias = 128;
   2983 
   2984 $code.=<<___;
   2985 .type	ecp_nistz256_point_addx,\@function,3
   2986 .align	32
   2987 ecp_nistz256_point_addx:
   2988 .cfi_startproc
   2989 .Lpoint_addx:
   2990 ___
   2991     }
   2992 $code.=<<___;
   2993 	push	%rbp
   2994 .cfi_push	%rbp
   2995 	push	%rbx
   2996 .cfi_push	%rbx
   2997 	push	%r12
   2998 .cfi_push	%r12
   2999 	push	%r13
   3000 .cfi_push	%r13
   3001 	push	%r14
   3002 .cfi_push	%r14
   3003 	push	%r15
   3004 .cfi_push	%r15
   3005 	sub	\$32*18+8, %rsp
   3006 .cfi_adjust_cfa_offset	32*18+8
   3007 .Lpoint_add${x}_body:
   3008 
   3009 	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
   3010 	movdqu	0x10($a_ptr), %xmm1
   3011 	movdqu	0x20($a_ptr), %xmm2
   3012 	movdqu	0x30($a_ptr), %xmm3
   3013 	movdqu	0x40($a_ptr), %xmm4
   3014 	movdqu	0x50($a_ptr), %xmm5
   3015 	mov	$a_ptr, $b_ptr			# reassign
   3016 	mov	$b_org, $a_ptr			# reassign
   3017 	movdqa	%xmm0, $in1_x(%rsp)
   3018 	movdqa	%xmm1, $in1_x+0x10(%rsp)
   3019 	movdqa	%xmm2, $in1_y(%rsp)
   3020 	movdqa	%xmm3, $in1_y+0x10(%rsp)
   3021 	movdqa	%xmm4, $in1_z(%rsp)
   3022 	movdqa	%xmm5, $in1_z+0x10(%rsp)
   3023 	por	%xmm4, %xmm5
   3024 
   3025 	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
   3026 	 pshufd	\$0xb1, %xmm5, %xmm3
   3027 	movdqu	0x10($a_ptr), %xmm1
   3028 	movdqu	0x20($a_ptr), %xmm2
   3029 	 por	%xmm3, %xmm5
   3030 	movdqu	0x30($a_ptr), %xmm3
   3031 	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
   3032 	 mov	0x40+8*1($a_ptr), $acc6
   3033 	 mov	0x40+8*2($a_ptr), $acc7
   3034 	 mov	0x40+8*3($a_ptr), $acc0
   3035 	movdqa	%xmm0, $in2_x(%rsp)
   3036 	 pshufd	\$0x1e, %xmm5, %xmm4
   3037 	movdqa	%xmm1, $in2_x+0x10(%rsp)
   3038 	movdqu	0x40($a_ptr),%xmm0		# in2_z again
   3039 	movdqu	0x50($a_ptr),%xmm1
   3040 	movdqa	%xmm2, $in2_y(%rsp)
   3041 	movdqa	%xmm3, $in2_y+0x10(%rsp)
   3042 	 por	%xmm4, %xmm5
   3043 	 pxor	%xmm4, %xmm4
   3044 	por	%xmm0, %xmm1
   3045 	 movq	$r_ptr, %xmm0			# save $r_ptr
   3046 
   3047 	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
   3048 	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
   3049 	 mov	$acc6, $in2_z+8*1(%rsp)
   3050 	 mov	$acc7, $in2_z+8*2(%rsp)
   3051 	 mov	$acc0, $in2_z+8*3(%rsp)
   3052 	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
   3053 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
   3054 
   3055 	pcmpeqd	%xmm4, %xmm5
   3056 	pshufd	\$0xb1, %xmm1, %xmm4
   3057 	por	%xmm1, %xmm4
   3058 	pshufd	\$0, %xmm5, %xmm5		# in1infty
   3059 	pshufd	\$0x1e, %xmm4, %xmm3
   3060 	por	%xmm3, %xmm4
   3061 	pxor	%xmm3, %xmm3
   3062 	pcmpeqd	%xmm3, %xmm4
   3063 	pshufd	\$0, %xmm4, %xmm4		# in2infty
   3064 	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
   3065 	 mov	0x40+8*1($b_ptr), $acc6
   3066 	 mov	0x40+8*2($b_ptr), $acc7
   3067 	 mov	0x40+8*3($b_ptr), $acc0
   3068 	movq	$b_ptr, %xmm1
   3069 
   3070 	lea	0x40-$bias($b_ptr), $a_ptr
   3071 	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
   3072 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
   3073 
   3074 	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
   3075 	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
   3076 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
   3077 
   3078 	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
   3079 	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
   3080 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
   3081 
   3082 	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
   3083 	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
   3084 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
   3085 
   3086 	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
   3087 	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
   3088 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
   3089 
   3090 	lea	$S1(%rsp), $b_ptr
   3091 	lea	$R(%rsp), $r_ptr		# R = S2 - S1
   3092 	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
   3093 
   3094 	or	$acc5, $acc4			# see if result is zero
   3095 	movdqa	%xmm4, %xmm2
   3096 	or	$acc0, $acc4
   3097 	or	$acc1, $acc4
   3098 	por	%xmm5, %xmm2			# in1infty || in2infty
   3099 	movq	$acc4, %xmm3
   3100 
   3101 	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
   3102 	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
   3103 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
   3104 
   3105 	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
   3106 	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
   3107 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
   3108 
   3109 	lea	$U1(%rsp), $b_ptr
   3110 	lea	$H(%rsp), $r_ptr		# H = U2 - U1
   3111 	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
   3112 
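	# Dispatch on the special cases: a non-zero H means U1 != U2 and the
	# general addition formula applies.  If H is zero but either input
	# was the point at infinity, the general path is still taken and the
	# conditional copies at the end produce the correct result.  If H is
	# zero, neither input is infinity and S1 == S2, the inputs are equal
	# and control transfers to the doubling shortcut; otherwise the
	# points are negatives of each other and an all-zero result (the
	# point at infinity) is stored below.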
   3113 	or	$acc5, $acc4			# see if result is zero
   3114 	or	$acc0, $acc4
   3115 	or	$acc1, $acc4
   3116 
   3117 	.byte	0x3e				# predict taken
   3118 	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
   3119 	movq	%xmm2, $acc0
   3120 	movq	%xmm3, $acc1
   3121 	test	$acc0, $acc0
   3122 	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
   3123 	test	$acc1, $acc1
   3124 	jz	.Ladd_double$x			# is_equal(S1,S2)?
   3125 
   3126 	movq	%xmm0, $r_ptr			# restore $r_ptr
   3127 	pxor	%xmm0, %xmm0
   3128 	movdqu	%xmm0, 0x00($r_ptr)
   3129 	movdqu	%xmm0, 0x10($r_ptr)
   3130 	movdqu	%xmm0, 0x20($r_ptr)
   3131 	movdqu	%xmm0, 0x30($r_ptr)
   3132 	movdqu	%xmm0, 0x40($r_ptr)
   3133 	movdqu	%xmm0, 0x50($r_ptr)
   3134 	jmp	.Ladd_done$x
   3135 
   3136 .align	32
   3137 .Ladd_double$x:
   3138 	movq	%xmm1, $a_ptr			# restore $a_ptr
   3139 	movq	%xmm0, $r_ptr			# restore $r_ptr
   3140 	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
   3141 .cfi_adjust_cfa_offset	`-32*(18-5)`
   3142 	jmp	.Lpoint_double_shortcut$x
   3143 .cfi_adjust_cfa_offset	`32*(18-5)`
   3144 
   3145 .align	32
   3146 .Ladd_proceed$x:
   3147 	`&load_for_sqr("$R(%rsp)", "$src0")`
   3148 	lea	$Rsqr(%rsp), $r_ptr		# R^2
   3149 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
   3150 
   3151 	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
   3152 	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
   3153 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
   3154 
   3155 	`&load_for_sqr("$H(%rsp)", "$src0")`
   3156 	lea	$Hsqr(%rsp), $r_ptr		# H^2
   3157 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
   3158 
   3159 	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
   3160 	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
   3161 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
   3162 
   3163 	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
   3164 	lea	$Hcub(%rsp), $r_ptr		# H^3
   3165 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
   3166 
   3167 	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
   3168 	lea	$U2(%rsp), $r_ptr		# U1*H^2
   3169 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
   3170 ___
   3171 {
   3172 #######################################################################
   3173 # operate in 4-5-0-1 "name space" that matches multiplication output
   3174 #
   3175 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
   3176 my ($poly1, $poly3)=($acc6,$acc7);
   3177 
   3178 $code.=<<___;
   3179 	#lea	$U2(%rsp), $a_ptr
   3180 	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
   3181 	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
   3182 
   3183 	xor	$t4, $t4
   3184 	add	$acc0, $acc0		# a0:a3+a0:a3
   3185 	lea	$Rsqr(%rsp), $a_ptr
   3186 	adc	$acc1, $acc1
   3187 	 mov	$acc0, $t0
   3188 	adc	$acc2, $acc2
   3189 	adc	$acc3, $acc3
   3190 	 mov	$acc1, $t1
   3191 	adc	\$0, $t4
   3192 
   3193 	sub	\$-1, $acc0
   3194 	 mov	$acc2, $t2
   3195 	sbb	$poly1, $acc1
   3196 	sbb	\$0, $acc2
   3197 	 mov	$acc3, $t3
   3198 	sbb	$poly3, $acc3
   3199 	sbb	\$0, $t4
   3200 
   3201 	cmovc	$t0, $acc0
   3202 	mov	8*0($a_ptr), $t0
   3203 	cmovc	$t1, $acc1
   3204 	mov	8*1($a_ptr), $t1
   3205 	cmovc	$t2, $acc2
   3206 	mov	8*2($a_ptr), $t2
   3207 	cmovc	$t3, $acc3
   3208 	mov	8*3($a_ptr), $t3
   3209 
   3210 	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
   3211 
   3212 	lea	$Hcub(%rsp), $b_ptr
   3213 	lea	$res_x(%rsp), $r_ptr
   3214 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
   3215 
   3216 	mov	$U2+8*0(%rsp), $t0
   3217 	mov	$U2+8*1(%rsp), $t1
   3218 	mov	$U2+8*2(%rsp), $t2
   3219 	mov	$U2+8*3(%rsp), $t3
   3220 	lea	$res_y(%rsp), $r_ptr
   3221 
   3222 	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
   3223 
   3224 	mov	$acc0, 8*0($r_ptr)		# save the result, as
   3225 	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
   3226 	mov	$acc2, 8*2($r_ptr)
   3227 	mov	$acc3, 8*3($r_ptr)
   3228 ___
   3229 }
   3230 $code.=<<___;
   3231 	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
   3232 	lea	$S2(%rsp), $r_ptr
   3233 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
   3234 
   3235 	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
   3236 	lea	$res_y(%rsp), $r_ptr
   3237 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
   3238 
   3239 	lea	$S2(%rsp), $b_ptr
   3240 	lea	$res_y(%rsp), $r_ptr
   3241 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
   3242 
   3243 	movq	%xmm0, $r_ptr		# restore $r_ptr
   3244 
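	# Branchless fix-up of the result: %xmm5 is an all-ones mask when
	# the first input was the point at infinity and %xmm4 when the
	# second was, so each coordinate below is taken from the other
	# input when one operand is infinity and from the computed result
	# otherwise.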
   3245 	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
   3246 	movdqa	%xmm5, %xmm1
   3247 	pandn	$res_z(%rsp), %xmm0
   3248 	movdqa	%xmm5, %xmm2
   3249 	pandn	$res_z+0x10(%rsp), %xmm1
   3250 	movdqa	%xmm5, %xmm3
   3251 	pand	$in2_z(%rsp), %xmm2
   3252 	pand	$in2_z+0x10(%rsp), %xmm3
   3253 	por	%xmm0, %xmm2
   3254 	por	%xmm1, %xmm3
   3255 
   3256 	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
   3257 	movdqa	%xmm4, %xmm1
   3258 	pandn	%xmm2, %xmm0
   3259 	movdqa	%xmm4, %xmm2
   3260 	pandn	%xmm3, %xmm1
   3261 	movdqa	%xmm4, %xmm3
   3262 	pand	$in1_z(%rsp), %xmm2
   3263 	pand	$in1_z+0x10(%rsp), %xmm3
   3264 	por	%xmm0, %xmm2
   3265 	por	%xmm1, %xmm3
   3266 	movdqu	%xmm2, 0x40($r_ptr)
   3267 	movdqu	%xmm3, 0x50($r_ptr)
   3268 
   3269 	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
   3270 	movdqa	%xmm5, %xmm1
   3271 	pandn	$res_x(%rsp), %xmm0
   3272 	movdqa	%xmm5, %xmm2
   3273 	pandn	$res_x+0x10(%rsp), %xmm1
   3274 	movdqa	%xmm5, %xmm3
   3275 	pand	$in2_x(%rsp), %xmm2
   3276 	pand	$in2_x+0x10(%rsp), %xmm3
   3277 	por	%xmm0, %xmm2
   3278 	por	%xmm1, %xmm3
   3279 
   3280 	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
   3281 	movdqa	%xmm4, %xmm1
   3282 	pandn	%xmm2, %xmm0
   3283 	movdqa	%xmm4, %xmm2
   3284 	pandn	%xmm3, %xmm1
   3285 	movdqa	%xmm4, %xmm3
   3286 	pand	$in1_x(%rsp), %xmm2
   3287 	pand	$in1_x+0x10(%rsp), %xmm3
   3288 	por	%xmm0, %xmm2
   3289 	por	%xmm1, %xmm3
   3290 	movdqu	%xmm2, 0x00($r_ptr)
   3291 	movdqu	%xmm3, 0x10($r_ptr)
   3292 
   3293 	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
   3294 	movdqa	%xmm5, %xmm1
   3295 	pandn	$res_y(%rsp), %xmm0
   3296 	movdqa	%xmm5, %xmm2
   3297 	pandn	$res_y+0x10(%rsp), %xmm1
   3298 	movdqa	%xmm5, %xmm3
   3299 	pand	$in2_y(%rsp), %xmm2
   3300 	pand	$in2_y+0x10(%rsp), %xmm3
   3301 	por	%xmm0, %xmm2
   3302 	por	%xmm1, %xmm3
   3303 
   3304 	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
   3305 	movdqa	%xmm4, %xmm1
   3306 	pandn	%xmm2, %xmm0
   3307 	movdqa	%xmm4, %xmm2
   3308 	pandn	%xmm3, %xmm1
   3309 	movdqa	%xmm4, %xmm3
   3310 	pand	$in1_y(%rsp), %xmm2
   3311 	pand	$in1_y+0x10(%rsp), %xmm3
   3312 	por	%xmm0, %xmm2
   3313 	por	%xmm1, %xmm3
   3314 	movdqu	%xmm2, 0x20($r_ptr)
   3315 	movdqu	%xmm3, 0x30($r_ptr)
   3316 
   3317 .Ladd_done$x:
   3318 	lea	32*18+56(%rsp), %rsi
   3319 .cfi_def_cfa	%rsi,8
   3320 	mov	-48(%rsi),%r15
   3321 .cfi_restore	%r15
   3322 	mov	-40(%rsi),%r14
   3323 .cfi_restore	%r14
   3324 	mov	-32(%rsi),%r13
   3325 .cfi_restore	%r13
   3326 	mov	-24(%rsi),%r12
   3327 .cfi_restore	%r12
   3328 	mov	-16(%rsi),%rbx
   3329 .cfi_restore	%rbx
   3330 	mov	-8(%rsi),%rbp
   3331 .cfi_restore	%rbp
   3332 	lea	(%rsi),%rsp
   3333 .cfi_def_cfa_register	%rsp
   3334 .Lpoint_add${x}_epilogue:
   3335 	ret
   3336 .cfi_endproc
   3337 .size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
   3338 ___
   3339 }
   3340 &gen_add("q");
   3341 
   3342 sub gen_add_affine () {
   3343     my $x = shift;
   3344     my ($src0,$sfx,$bias);
   3345     my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
   3346 	$res_x,$res_y,$res_z,
   3347 	$in1_x,$in1_y,$in1_z,
   3348 	$in2_x,$in2_y)=map(32*$_,(0..14));
   3349     my $Z1sqr = $S2;
   3350 
   3351     if ($x ne "x") {
   3352 	$src0 = "%rax";
   3353 	$sfx  = "";
   3354 	$bias = 0;
   3355 
   3356 $code.=<<___;
   3357 .globl	ecp_nistz256_point_add_affine
   3358 .type	ecp_nistz256_point_add_affine,\@function,3
   3359 .align	32
   3360 ecp_nistz256_point_add_affine:
   3361 .cfi_startproc
   3362 ___
   3363 $code.=<<___	if ($addx);
   3364 	leaq	OPENSSL_ia32cap_P(%rip), %rcx
   3365 	mov	8(%rcx), %rcx
   3366 	and	\$0x80100, %ecx
   3367 	cmp	\$0x80100, %ecx
   3368 	je	.Lpoint_add_affinex
   3369 ___
   3370     } else {
   3371 	$src0 = "%rdx";
   3372 	$sfx  = "x";
   3373 	$bias = 128;
   3374 
   3375 $code.=<<___;
   3376 .type	ecp_nistz256_point_add_affinex,\@function,3
   3377 .align	32
   3378 ecp_nistz256_point_add_affinex:
   3379 .cfi_startproc
   3380 .Lpoint_add_affinex:
   3381 ___
   3382     }
   3383 $code.=<<___;
   3384 	push	%rbp
   3385 .cfi_push	%rbp
   3386 	push	%rbx
   3387 .cfi_push	%rbx
   3388 	push	%r12
   3389 .cfi_push	%r12
   3390 	push	%r13
   3391 .cfi_push	%r13
   3392 	push	%r14
   3393 .cfi_push	%r14
   3394 	push	%r15
   3395 .cfi_push	%r15
   3396 	sub	\$32*15+8, %rsp
   3397 .cfi_adjust_cfa_offset	32*15+8
   3398 .Ladd_affine${x}_body:
   3399 
   3400 	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
   3401 	mov	$b_org, $b_ptr		# reassign
   3402 	movdqu	0x10($a_ptr), %xmm1
   3403 	movdqu	0x20($a_ptr), %xmm2
   3404 	movdqu	0x30($a_ptr), %xmm3
   3405 	movdqu	0x40($a_ptr), %xmm4
   3406 	movdqu	0x50($a_ptr), %xmm5
   3407 	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
   3408 	 mov	0x40+8*1($a_ptr), $acc6
   3409 	 mov	0x40+8*2($a_ptr), $acc7
   3410 	 mov	0x40+8*3($a_ptr), $acc0
   3411 	movdqa	%xmm0, $in1_x(%rsp)
   3412 	movdqa	%xmm1, $in1_x+0x10(%rsp)
   3413 	movdqa	%xmm2, $in1_y(%rsp)
   3414 	movdqa	%xmm3, $in1_y+0x10(%rsp)
   3415 	movdqa	%xmm4, $in1_z(%rsp)
   3416 	movdqa	%xmm5, $in1_z+0x10(%rsp)
   3417 	por	%xmm4, %xmm5
   3418 
   3419 	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
   3420 	 pshufd	\$0xb1, %xmm5, %xmm3
   3421 	movdqu	0x10($b_ptr), %xmm1
   3422 	movdqu	0x20($b_ptr), %xmm2
   3423 	 por	%xmm3, %xmm5
   3424 	movdqu	0x30($b_ptr), %xmm3
   3425 	movdqa	%xmm0, $in2_x(%rsp)
   3426 	 pshufd	\$0x1e, %xmm5, %xmm4
   3427 	movdqa	%xmm1, $in2_x+0x10(%rsp)
   3428 	por	%xmm0, %xmm1
   3429 	 movq	$r_ptr, %xmm0		# save $r_ptr
   3430 	movdqa	%xmm2, $in2_y(%rsp)
   3431 	movdqa	%xmm3, $in2_y+0x10(%rsp)
   3432 	por	%xmm2, %xmm3
   3433 	 por	%xmm4, %xmm5
   3434 	 pxor	%xmm4, %xmm4
   3435 	por	%xmm1, %xmm3
   3436 
   3437 	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
   3438 	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
   3439 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
   3440 
   3441 	pcmpeqd	%xmm4, %xmm5
   3442 	pshufd	\$0xb1, %xmm3, %xmm4
   3443 	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
   3444 	 #lea	0x00($b_ptr), $b_ptr
   3445 	 mov	$acc4, $acc1			# harmonize sqr output and mul input
   3446 	por	%xmm3, %xmm4
   3447 	pshufd	\$0, %xmm5, %xmm5		# in1infty
   3448 	pshufd	\$0x1e, %xmm4, %xmm3
   3449 	 mov	$acc5, $acc2
   3450 	por	%xmm3, %xmm4
   3451 	pxor	%xmm3, %xmm3
   3452 	 mov	$acc6, $acc3
   3453 	pcmpeqd	%xmm3, %xmm4
   3454 	pshufd	\$0, %xmm4, %xmm4		# in2infty
   3455 
   3456 	lea	$Z1sqr-$bias(%rsp), $a_ptr
   3457 	mov	$acc7, $acc4
   3458 	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
   3459 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);
   3460 
   3461 	lea	$in1_x(%rsp), $b_ptr
   3462 	lea	$H(%rsp), $r_ptr		# H = U2 - U1
   3463 	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);
   3464 
   3465 	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
   3466 	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
   3467 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
   3468 
   3469 	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
   3470 	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
   3471 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
   3472 
   3473 	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
   3474 	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
   3475 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
   3476 
   3477 	lea	$in1_y(%rsp), $b_ptr
   3478 	lea	$R(%rsp), $r_ptr		# R = S2 - S1
   3479 	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);
   3480 
   3481 	`&load_for_sqr("$H(%rsp)", "$src0")`
   3482 	lea	$Hsqr(%rsp), $r_ptr		# H^2
   3483 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
   3484 
   3485 	`&load_for_sqr("$R(%rsp)", "$src0")`
   3486 	lea	$Rsqr(%rsp), $r_ptr		# R^2
   3487 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
   3488 
   3489 	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
   3490 	lea	$Hcub(%rsp), $r_ptr		# H^3
   3491 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
   3492 
   3493 	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
   3494 	lea	$U2(%rsp), $r_ptr		# U1*H^2
   3495 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
   3496 ___
   3497 {
   3498 #######################################################################
   3499 # operate in 4-5-0-1 "name space" that matches multiplication output
   3500 #
   3501 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
   3502 my ($poly1, $poly3)=($acc6,$acc7);
   3503 
   3504 $code.=<<___;
   3505 	#lea	$U2(%rsp), $a_ptr
   3506 	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
   3507 	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
   3508 
   3509 	xor	$t4, $t4
   3510 	add	$acc0, $acc0		# a0:a3+a0:a3
   3511 	lea	$Rsqr(%rsp), $a_ptr
   3512 	adc	$acc1, $acc1
   3513 	 mov	$acc0, $t0
   3514 	adc	$acc2, $acc2
   3515 	adc	$acc3, $acc3
   3516 	 mov	$acc1, $t1
   3517 	adc	\$0, $t4
   3518 
   3519 	sub	\$-1, $acc0
   3520 	 mov	$acc2, $t2
   3521 	sbb	$poly1, $acc1
   3522 	sbb	\$0, $acc2
   3523 	 mov	$acc3, $t3
   3524 	sbb	$poly3, $acc3
   3525 	sbb	\$0, $t4
   3526 
   3527 	cmovc	$t0, $acc0
   3528 	mov	8*0($a_ptr), $t0
   3529 	cmovc	$t1, $acc1
   3530 	mov	8*1($a_ptr), $t1
   3531 	cmovc	$t2, $acc2
   3532 	mov	8*2($a_ptr), $t2
   3533 	cmovc	$t3, $acc3
   3534 	mov	8*3($a_ptr), $t3
   3535 
   3536 	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
   3537 
   3538 	lea	$Hcub(%rsp), $b_ptr
   3539 	lea	$res_x(%rsp), $r_ptr
   3540 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
   3541 
   3542 	mov	$U2+8*0(%rsp), $t0
   3543 	mov	$U2+8*1(%rsp), $t1
   3544 	mov	$U2+8*2(%rsp), $t2
   3545 	mov	$U2+8*3(%rsp), $t3
   3546 	lea	$H(%rsp), $r_ptr
   3547 
   3548 	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);
   3549 
   3550 	mov	$acc0, 8*0($r_ptr)		# save the result, as
   3551 	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
   3552 	mov	$acc2, 8*2($r_ptr)
   3553 	mov	$acc3, 8*3($r_ptr)
   3554 ___
   3555 }
   3556 $code.=<<___;
   3557 	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
   3558 	lea	$S2(%rsp), $r_ptr
   3559 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);
   3560 
   3561 	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
   3562 	lea	$H(%rsp), $r_ptr
   3563 	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);
   3564 
   3565 	lea	$S2(%rsp), $b_ptr
   3566 	lea	$res_y(%rsp), $r_ptr
   3567 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);
   3568 
   3569 	movq	%xmm0, $r_ptr		# restore $r_ptr
   3570 
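	# Same branchless fix-up as in point_add, except that the affine
	# second input has an implicit Z = 1, so when the first input is
	# the point at infinity the Z coordinate is taken from .LONE_mont
	# (1 in Montgomery form) instead of from a stored in2_z.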
   3571 	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
   3572 	movdqa	%xmm5, %xmm1
   3573 	pandn	$res_z(%rsp), %xmm0
   3574 	movdqa	%xmm5, %xmm2
   3575 	pandn	$res_z+0x10(%rsp), %xmm1
   3576 	movdqa	%xmm5, %xmm3
   3577 	pand	.LONE_mont(%rip), %xmm2
   3578 	pand	.LONE_mont+0x10(%rip), %xmm3
   3579 	por	%xmm0, %xmm2
   3580 	por	%xmm1, %xmm3
   3581 
   3582 	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
   3583 	movdqa	%xmm4, %xmm1
   3584 	pandn	%xmm2, %xmm0
   3585 	movdqa	%xmm4, %xmm2
   3586 	pandn	%xmm3, %xmm1
   3587 	movdqa	%xmm4, %xmm3
   3588 	pand	$in1_z(%rsp), %xmm2
   3589 	pand	$in1_z+0x10(%rsp), %xmm3
   3590 	por	%xmm0, %xmm2
   3591 	por	%xmm1, %xmm3
   3592 	movdqu	%xmm2, 0x40($r_ptr)
   3593 	movdqu	%xmm3, 0x50($r_ptr)
   3594 
   3595 	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
   3596 	movdqa	%xmm5, %xmm1
   3597 	pandn	$res_x(%rsp), %xmm0
   3598 	movdqa	%xmm5, %xmm2
   3599 	pandn	$res_x+0x10(%rsp), %xmm1
   3600 	movdqa	%xmm5, %xmm3
   3601 	pand	$in2_x(%rsp), %xmm2
   3602 	pand	$in2_x+0x10(%rsp), %xmm3
   3603 	por	%xmm0, %xmm2
   3604 	por	%xmm1, %xmm3
   3605 
   3606 	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
   3607 	movdqa	%xmm4, %xmm1
   3608 	pandn	%xmm2, %xmm0
   3609 	movdqa	%xmm4, %xmm2
   3610 	pandn	%xmm3, %xmm1
   3611 	movdqa	%xmm4, %xmm3
   3612 	pand	$in1_x(%rsp), %xmm2
   3613 	pand	$in1_x+0x10(%rsp), %xmm3
   3614 	por	%xmm0, %xmm2
   3615 	por	%xmm1, %xmm3
   3616 	movdqu	%xmm2, 0x00($r_ptr)
   3617 	movdqu	%xmm3, 0x10($r_ptr)
   3618 
   3619 	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
   3620 	movdqa	%xmm5, %xmm1
   3621 	pandn	$res_y(%rsp), %xmm0
   3622 	movdqa	%xmm5, %xmm2
   3623 	pandn	$res_y+0x10(%rsp), %xmm1
   3624 	movdqa	%xmm5, %xmm3
   3625 	pand	$in2_y(%rsp), %xmm2
   3626 	pand	$in2_y+0x10(%rsp), %xmm3
   3627 	por	%xmm0, %xmm2
   3628 	por	%xmm1, %xmm3
   3629 
   3630 	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
   3631 	movdqa	%xmm4, %xmm1
   3632 	pandn	%xmm2, %xmm0
   3633 	movdqa	%xmm4, %xmm2
   3634 	pandn	%xmm3, %xmm1
   3635 	movdqa	%xmm4, %xmm3
   3636 	pand	$in1_y(%rsp), %xmm2
   3637 	pand	$in1_y+0x10(%rsp), %xmm3
   3638 	por	%xmm0, %xmm2
   3639 	por	%xmm1, %xmm3
   3640 	movdqu	%xmm2, 0x20($r_ptr)
   3641 	movdqu	%xmm3, 0x30($r_ptr)
   3642 
   3643 	lea	32*15+56(%rsp), %rsi
   3644 .cfi_def_cfa	%rsi,8
   3645 	mov	-48(%rsi),%r15
   3646 .cfi_restore	%r15
   3647 	mov	-40(%rsi),%r14
   3648 .cfi_restore	%r14
   3649 	mov	-32(%rsi),%r13
   3650 .cfi_restore	%r13
   3651 	mov	-24(%rsi),%r12
   3652 .cfi_restore	%r12
   3653 	mov	-16(%rsi),%rbx
   3654 .cfi_restore	%rbx
   3655 	mov	-8(%rsi),%rbp
   3656 .cfi_restore	%rbp
   3657 	lea	(%rsi),%rsp
   3658 .cfi_def_cfa_register	%rsp
   3659 .Ladd_affine${x}_epilogue:
   3660 	ret
   3661 .cfi_endproc
   3662 .size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
   3663 ___
   3664 }
   3665 &gen_add_affine("q");
   3666 
   3667 ########################################################################
   3668 # AD*X magic
   3669 #
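# MULX leaves the flags alone, while ADCX and ADOX carry through CF and
# OF respectively, so the mulx-based multiply/square code above can run
# two independent carry chains in parallel.  The helper routines below
# mirror their non-ADX counterparts; each clears CF with an xor first,
# so its leading adc/sbb acts as a plain add/sub.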
   3670 if ($addx) {								{
   3671 ########################################################################
   3672 # operate in 4-5-0-1 "name space" that matches multiplication output
   3673 #
   3674 my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
   3675 
   3676 $code.=<<___;
   3677 .type	__ecp_nistz256_add_tox,\@abi-omnipotent
   3678 .align	32
   3679 __ecp_nistz256_add_tox:
   3680 .cfi_startproc
   3681 	xor	$t4, $t4
   3682 	adc	8*0($b_ptr), $a0
   3683 	adc	8*1($b_ptr), $a1
   3684 	 mov	$a0, $t0
   3685 	adc	8*2($b_ptr), $a2
   3686 	adc	8*3($b_ptr), $a3
   3687 	 mov	$a1, $t1
   3688 	adc	\$0, $t4
   3689 
   3690 	xor	$t3, $t3
   3691 	sbb	\$-1, $a0
   3692 	 mov	$a2, $t2
   3693 	sbb	$poly1, $a1
   3694 	sbb	\$0, $a2
   3695 	 mov	$a3, $t3
   3696 	sbb	$poly3, $a3
   3697 	sbb	\$0, $t4
   3698 
   3699 	cmovc	$t0, $a0
   3700 	cmovc	$t1, $a1
   3701 	mov	$a0, 8*0($r_ptr)
   3702 	cmovc	$t2, $a2
   3703 	mov	$a1, 8*1($r_ptr)
   3704 	cmovc	$t3, $a3
   3705 	mov	$a2, 8*2($r_ptr)
   3706 	mov	$a3, 8*3($r_ptr)
   3707 
   3708 	ret
   3709 .cfi_endproc
   3710 .size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
   3711 
   3712 .type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
   3713 .align	32
   3714 __ecp_nistz256_sub_fromx:
   3715 .cfi_startproc
   3716 	xor	$t4, $t4
   3717 	sbb	8*0($b_ptr), $a0
   3718 	sbb	8*1($b_ptr), $a1
   3719 	 mov	$a0, $t0
   3720 	sbb	8*2($b_ptr), $a2
   3721 	sbb	8*3($b_ptr), $a3
   3722 	 mov	$a1, $t1
   3723 	sbb	\$0, $t4
   3724 
   3725 	xor	$t3, $t3
   3726 	adc	\$-1, $a0
   3727 	 mov	$a2, $t2
   3728 	adc	$poly1, $a1
   3729 	adc	\$0, $a2
   3730 	 mov	$a3, $t3
   3731 	adc	$poly3, $a3
   3732 
   3733 	bt	\$0, $t4
   3734 	cmovnc	$t0, $a0
   3735 	cmovnc	$t1, $a1
   3736 	mov	$a0, 8*0($r_ptr)
   3737 	cmovnc	$t2, $a2
   3738 	mov	$a1, 8*1($r_ptr)
   3739 	cmovnc	$t3, $a3
   3740 	mov	$a2, 8*2($r_ptr)
   3741 	mov	$a3, 8*3($r_ptr)
   3742 
   3743 	ret
   3744 .cfi_endproc
   3745 .size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
   3746 
   3747 .type	__ecp_nistz256_subx,\@abi-omnipotent
   3748 .align	32
   3749 __ecp_nistz256_subx:
   3750 .cfi_startproc
   3751 	xor	$t4, $t4
   3752 	sbb	$a0, $t0
   3753 	sbb	$a1, $t1
   3754 	 mov	$t0, $a0
   3755 	sbb	$a2, $t2
   3756 	sbb	$a3, $t3
   3757 	 mov	$t1, $a1
   3758 	sbb	\$0, $t4
   3759 
    3760 	xor	$a3, $a3
   3761 	adc	\$-1, $t0
   3762 	 mov	$t2, $a2
   3763 	adc	$poly1, $t1
   3764 	adc	\$0, $t2
   3765 	 mov	$t3, $a3
   3766 	adc	$poly3, $t3
   3767 
   3768 	bt	\$0, $t4
   3769 	cmovc	$t0, $a0
   3770 	cmovc	$t1, $a1
   3771 	cmovc	$t2, $a2
   3772 	cmovc	$t3, $a3
   3773 
   3774 	ret
   3775 .cfi_endproc
   3776 .size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
   3777 
   3778 .type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
   3779 .align	32
   3780 __ecp_nistz256_mul_by_2x:
   3781 .cfi_startproc
   3782 	xor	$t4, $t4
   3783 	adc	$a0, $a0		# a0:a3+a0:a3
   3784 	adc	$a1, $a1
   3785 	 mov	$a0, $t0
   3786 	adc	$a2, $a2
   3787 	adc	$a3, $a3
   3788 	 mov	$a1, $t1
   3789 	adc	\$0, $t4
   3790 
   3791 	xor	$t3, $t3
   3792 	sbb	\$-1, $a0
   3793 	 mov	$a2, $t2
   3794 	sbb	$poly1, $a1
   3795 	sbb	\$0, $a2
   3796 	 mov	$a3, $t3
   3797 	sbb	$poly3, $a3
   3798 	sbb	\$0, $t4
   3799 
   3800 	cmovc	$t0, $a0
   3801 	cmovc	$t1, $a1
   3802 	mov	$a0, 8*0($r_ptr)
   3803 	cmovc	$t2, $a2
   3804 	mov	$a1, 8*1($r_ptr)
   3805 	cmovc	$t3, $a3
   3806 	mov	$a2, 8*2($r_ptr)
   3807 	mov	$a3, 8*3($r_ptr)
   3808 
   3809 	ret
   3810 .cfi_endproc
   3811 .size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
   3812 ___
   3813 									}
   3814 &gen_double("x");
   3815 &gen_add("x");
   3816 &gen_add_affine("x");
   3817 }
   3818 }}}
   3819 
   3820 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   3821 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
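# Both handlers below consult HandlerData[]: entry [0] is the RVA of the
# label marking the end of the prologue and entry [1] the RVA of the
# epilogue label; only between those two points do saved registers need
# to be recovered from the stack.  short_handler restores just %r12 and
# %r13 (evidently for the routines that push only those two), while
# full_handler additionally uses entry [2] as the distance from the
# interrupted Rsp to just above the six pushed general-purpose registers.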
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind

.type	short_handler,\@abi-omnipotent
.align	16
short_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	16(%rax),%rax

	mov	-8(%rax),%r12
	mov	-16(%rax),%r13
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13

	jmp	.Lcommon_seh_tail
.size	short_handler,.-short_handler

.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rax,%r10),%rax

	mov	-8(%rax),%rbp
	mov	-16(%rax),%rbx
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
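	# Common tail: recover the saved %rdi/%rsi from the unwound stack,
	# write the recovered register values back into the CONTEXT, copy that
	# CONTEXT over disp->ContextRecord, and let RtlVirtualUnwind continue
	# the unwind.  ExceptionContinueSearch (1) is returned.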
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	full_handler,.-full_handler

.section	.pdata
.align	4
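# Each .pdata entry is a RUNTIME_FUNCTION triple: the start and end RVAs
# of a routine and the RVA of its UNWIND_INFO record in .xdata below.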
	.rva	.LSEH_begin_ecp_nistz256_neg
	.rva	.LSEH_end_ecp_nistz256_neg
	.rva	.LSEH_info_ecp_nistz256_neg

	.rva	.LSEH_begin_ecp_nistz256_ord_mul_mont
	.rva	.LSEH_end_ecp_nistz256_ord_mul_mont
	.rva	.LSEH_info_ecp_nistz256_ord_mul_mont

	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_mont
	.rva	.LSEH_end_ecp_nistz256_ord_sqr_mont
	.rva	.LSEH_info_ecp_nistz256_ord_sqr_mont
___
$code.=<<___	if ($addx);
	.rva	.LSEH_begin_ecp_nistz256_ord_mul_montx
	.rva	.LSEH_end_ecp_nistz256_ord_mul_montx
	.rva	.LSEH_info_ecp_nistz256_ord_mul_montx

	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_montx
	.rva	.LSEH_end_ecp_nistz256_ord_sqr_montx
	.rva	.LSEH_info_ecp_nistz256_ord_sqr_montx
___
$code.=<<___;
	.rva	.LSEH_begin_ecp_nistz256_mul_mont
	.rva	.LSEH_end_ecp_nistz256_mul_mont
	.rva	.LSEH_info_ecp_nistz256_mul_mont

	.rva	.LSEH_begin_ecp_nistz256_sqr_mont
	.rva	.LSEH_end_ecp_nistz256_sqr_mont
	.rva	.LSEH_info_ecp_nistz256_sqr_mont

	.rva	.LSEH_begin_ecp_nistz256_select_w5
	.rva	.LSEH_end_ecp_nistz256_select_w5
	.rva	.LSEH_info_ecp_nistz256_select_wX

	.rva	.LSEH_begin_ecp_nistz256_select_w7
	.rva	.LSEH_end_ecp_nistz256_select_w7
	.rva	.LSEH_info_ecp_nistz256_select_wX
___
$code.=<<___	if ($avx>1);
	.rva	.LSEH_begin_ecp_nistz256_avx2_select_w5
	.rva	.LSEH_end_ecp_nistz256_avx2_select_w5
	.rva	.LSEH_info_ecp_nistz256_avx2_select_wX

	.rva	.LSEH_begin_ecp_nistz256_avx2_select_w7
	.rva	.LSEH_end_ecp_nistz256_avx2_select_w7
	.rva	.LSEH_info_ecp_nistz256_avx2_select_wX
___
$code.=<<___;
	.rva	.LSEH_begin_ecp_nistz256_point_double
	.rva	.LSEH_end_ecp_nistz256_point_double
	.rva	.LSEH_info_ecp_nistz256_point_double

	.rva	.LSEH_begin_ecp_nistz256_point_add
	.rva	.LSEH_end_ecp_nistz256_point_add
	.rva	.LSEH_info_ecp_nistz256_point_add

	.rva	.LSEH_begin_ecp_nistz256_point_add_affine
	.rva	.LSEH_end_ecp_nistz256_point_add_affine
	.rva	.LSEH_info_ecp_nistz256_point_add_affine
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_ecp_nistz256_point_doublex
	.rva	.LSEH_end_ecp_nistz256_point_doublex
	.rva	.LSEH_info_ecp_nistz256_point_doublex

	.rva	.LSEH_begin_ecp_nistz256_point_addx
	.rva	.LSEH_end_ecp_nistz256_point_addx
	.rva	.LSEH_info_ecp_nistz256_point_addx

	.rva	.LSEH_begin_ecp_nistz256_point_add_affinex
	.rva	.LSEH_end_ecp_nistz256_point_add_affinex
	.rva	.LSEH_info_ecp_nistz256_point_add_affinex
___
$code.=<<___;

.section	.xdata
.align	8
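# In the entries below, ".byte 9,0,0,0" encodes UNWIND_INFO version 1 with
# the "exception handler present" flag and no prologue unwind codes; the
# handler RVA and its HandlerData (body/epilogue labels, plus a frame size
# consumed by full_handler) follow.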
.LSEH_info_ecp_nistz256_neg:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
.LSEH_info_ecp_nistz256_ord_mul_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_mul_body,.Lord_mul_epilogue	# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_ord_sqr_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_sqr_body,.Lord_sqr_epilogue	# HandlerData[]
	.long	48,0
___
$code.=<<___ if ($addx);
.LSEH_info_ecp_nistz256_ord_mul_montx:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_mulx_body,.Lord_mulx_epilogue	# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_ord_sqr_montx:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_sqrx_body,.Lord_sqrx_epilogue	# HandlerData[]
	.long	48,0
___
$code.=<<___;
.LSEH_info_ecp_nistz256_mul_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_sqr_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lsqr_body,.Lsqr_epilogue		# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_select_wX:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
	.align	8
___
$code.=<<___	if ($avx>1);
.LSEH_info_ecp_nistz256_avx2_select_wX:
	.byte	0x01,0x36,0x17,0x0b
	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
	.align	8
___
$code.=<<___;
.LSEH_info_ecp_nistz256_point_double:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_doubleq_body,.Lpoint_doubleq_epilogue	# HandlerData[]
	.long	32*5+56,0
.LSEH_info_ecp_nistz256_point_add:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_addq_body,.Lpoint_addq_epilogue		# HandlerData[]
	.long	32*18+56,0
.LSEH_info_ecp_nistz256_point_add_affine:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Ladd_affineq_body,.Ladd_affineq_epilogue	# HandlerData[]
	.long	32*15+56,0
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_ecp_nistz256_point_doublex:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_doublex_body,.Lpoint_doublex_epilogue	# HandlerData[]
	.long	32*5+56,0
.LSEH_info_ecp_nistz256_point_addx:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_addx_body,.Lpoint_addx_epilogue		# HandlerData[]
	.long	32*18+56,0
.LSEH_info_ecp_nistz256_point_add_affinex:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Ladd_affinex_body,.Ladd_affinex_epilogue	# HandlerData[]
	.long	32*15+56,0
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";